summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2014-04-28 20:55:50 -0600
committerKarl Williamson <khw@cpan.org>2014-05-30 10:33:02 -0600
commit8954b91a52258523bade4624c06e76d720ed31ae (patch)
tree9a5b6b54a3d4a01eb6da9e9b8d421f62edafe6b1
parent4f3da64afef7a66791ba9b5f80dd8a1c044604a6 (diff)
downloadperl-8954b91a52258523bade4624c06e76d720ed31ae.tar.gz
regcomp.c: Move code into a function
This is in preparation for it to be called from another place
-rw-r--r--embed.fnc2
-rw-r--r--embed.h1
-rw-r--r--lib/diagnostics.t2
-rw-r--r--pod/perldiag.pod2
-rw-r--r--proto.h6
-rw-r--r--regcomp.c143
6 files changed, 82 insertions, 74 deletions
diff --git a/embed.fnc b/embed.fnc
index 3510fed976..cb2d7b1382 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2070,6 +2070,8 @@ Es |regnode*|regclass |NN RExC_state_t *pRExC_state \
|bool allow_multi_fold \
|const bool silence_non_portable \
|NULLOK SV** ret_invlist
+Es |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
+ |NN SV** invlist
Es |bool|could_it_be_a_POSIX_class|NN RExC_state_t *pRExC_state
Es |regnode*|handle_regex_sets|NN RExC_state_t *pRExC_state \
|NULLOK SV ** return_invlist \
diff --git a/embed.h b/embed.h
index 481bdac9bc..0e33affbc4 100644
--- a/embed.h
+++ b/embed.h
@@ -907,6 +907,7 @@
# if defined(PERL_IN_REGCOMP_C)
#define _append_range_to_invlist(a,b,c) S__append_range_to_invlist(aTHX_ a,b,c)
#define _invlist_array_init(a,b) S__invlist_array_init(aTHX_ a,b)
+#define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c)
#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b)
#define add_data S_add_data
#define alloc_maybe_populate_EXACT(a,b,c,d,e,f) S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e,f)
diff --git a/lib/diagnostics.t b/lib/diagnostics.t
index 8868edabed..7c04342735 100644
--- a/lib/diagnostics.t
+++ b/lib/diagnostics.t
@@ -112,7 +112,7 @@ like $warning,
# ; at end of entry in perldiag.pod
seek STDERR, 0,0;
$warning = '';
-warn "Perl folding rules are not up-to-date for 0xA; please use the perlbug utility to report; in regex; marked by <-- HERE in m/\ <-- HERE q/";
+warn "Perl folding rules are not up-to-date for 0x0A; please use the perlbug utility to report; in regex; marked by <-- HERE in m/\ <-- HERE q/";
like $warning,
qr/You used a regular expression with case-insensitive matching/s,
'; works at the end of entries in perldiag.pod';
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 6c02d65e37..0ccf5fb20d 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4147,7 +4147,7 @@ redirected it with select().)
"Can't locate object method \"%s\" via package \"%s\"". It often means
that a method requires a package that has not been loaded.
-=item Perl folding rules are not up-to-date for 0x%X; please use the perlbug
+=item Perl folding rules are not up-to-date for 0x%X; please use the perlbug
utility to report; in regex; marked by S<<-- HERE> in m/%s/
(S regexp) You used a regular expression with case-insensitive matching,
diff --git a/proto.h b/proto.h
index f5793cc892..06a31e38fe 100644
--- a/proto.h
+++ b/proto.h
@@ -6642,6 +6642,12 @@ PERL_STATIC_INLINE UV* S__invlist_array_init(pTHX_ SV* const invlist, const bool
#define PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT \
assert(invlist)
+STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS \
+ assert(pRExC_state); assert(invlist)
+
PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
__attribute__warn_unused_result__;
diff --git a/regcomp.c b/regcomp.c
index a6e1818984..693d044e43 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13119,6 +13119,74 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
}
#undef IS_OPERAND
+STATIC void
+S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+{
+ /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
+ * innocent-looking character class, like /[ks]/i won't have to go out to
+ * disk to find the possible matches.
+ *
+ * This should be called only for a Latin1-range code points, cp, which is
+ * known to be involved in a fold with other code points above Latin1. It
+ * would give false results if /aa has been specified. Multi-char folds
+ * are outside the scope of this, and must be handled specially.
+ *
+ * XXX It would be better to generate these via regen, in case a new
+ * version of the Unicode standard adds new mappings, though that is not
+ * really likely, and may be caught by the default: case of the switch
+ * below. */
+
+ PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
+
+ switch (cp) {
+ case 'k':
+ case 'K':
+ *invlist =
+ add_cp_to_invlist(*invlist, KELVIN_SIGN);
+ break;
+ case 's':
+ case 'S':
+ *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
+ break;
+ case MICRO_SIGN:
+ *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
+ *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ *invlist = add_cp_to_invlist(*invlist,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
+ break;
+ case 'F': case 'f':
+ case 'I': case 'i':
+ case 'L': case 'l':
+ case 'T': case 't':
+ case 'A': case 'a':
+ case 'H': case 'h':
+ case 'J': case 'j':
+ case 'N': case 'n':
+ case 'W': case 'w':
+ case 'Y': case 'y':
+ /* These all are targets of multi-character folds from code points
+ * that require UTF8 to express, so they can't match unless the
+ * target string is in UTF-8, so no action here is necessary, as
+ * regexec.c properly handles the general case for UTF-8 matching
+ * and multi-char folds */
+ break;
+ default:
+ /* Use deprecated warning to increase the chances of this being
+ * output */
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+ break;
+ }
+}
+
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
@@ -14338,15 +14406,6 @@ parseit:
if (j < 256) {
- /* We have the latin1 folding rules hard-coded here so
- * that an innocent-looking character class, like
- * /[ks]/i won't have to go out to disk to find the
- * possible matches. XXX It would be better to
- * generate these via regen, in case a new version of
- * the Unicode standard adds new mappings, though that
- * is not really likely, and may be caught by the
- * default: case of the switch below. */
-
if (IS_IN_SOME_FOLD_L1(j)) {
/* ASCII is always matched; non-ASCII is matched
@@ -14366,69 +14425,9 @@ parseit:
if (HAS_NONLATIN1_FOLD_CLOSURE(j)
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
{
- /* Certain Latin1 characters have matches outside
- * Latin1. To get here, <j> is one of those
- * characters. None of these matches is valid for
- * ASCII characters under /aa, which is why the 'if'
- * just above excludes those. These matches only
- * happen when the target string is utf8. The code
- * below adds the single fold closures for <j> to the
- * inversion list. */
-
- switch (j) {
- case 'k':
- case 'K':
- *use_list =
- add_cp_to_invlist(*use_list, KELVIN_SIGN);
- break;
- case 's':
- case 'S':
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_SMALL_LETTER_LONG_S);
- break;
- case MICRO_SIGN:
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_CAPITAL_LETTER_MU);
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_SMALL_LETTER_MU);
- break;
- case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
- case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
- *use_list =
- add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
- break;
- case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_SHARP_S);
- break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character
- * folds from code points that require UTF8
- * to express, so they can't match unless
- * the target string is in UTF-8, so no
- * action here is necessary, as regexec.c
- * properly handles the general case for
- * UTF-8 matching and multi-char folds */
- break;
- default:
- /* Use deprecated warning to increase the
- * chances of this being output */
- ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
- break;
- }
+ add_above_Latin1_folds(pRExC_state,
+ (U8) j,
+ use_list);
}
continue;
}