regcomp.c: Move code into a function

This is in preparation for it to be called from another place
author: Karl Williamson <khw@cpan.org> 2014-04-28 20:55:50 -0600
committer: Karl Williamson <khw@cpan.org> 2014-05-30 10:33:02 -0600
commit: 8954b91a52258523bade4624c06e76d720ed31ae (patch)
tree: 9a5b6b54a3d4a01eb6da9e9b8d421f62edafe6b1
parent: 4f3da64afef7a66791ba9b5f80dd8a1c044604a6 (diff)
download: perl-8954b91a52258523bade4624c06e76d720ed31ae.tar.gz
6 files changed, 82 insertions, 74 deletions
diff --git a/embed.fnc b/embed.fnc
index 3510fed976..cb2d7b1382 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2070,6 +2070,8 @@ Es	|regnode*|regclass	|NN RExC_state_t *pRExC_state \
 				|bool allow_multi_fold                        \
 				|const bool silence_non_portable	      \
 				|NULLOK SV** ret_invlist
+Es	|void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
+				|NN SV** invlist
 Es	|bool|could_it_be_a_POSIX_class|NN RExC_state_t *pRExC_state
 Es	|regnode*|handle_regex_sets|NN RExC_state_t *pRExC_state \
 				|NULLOK SV ** return_invlist            \
diff --git a/embed.h b/embed.h
index 481bdac9bc..0e33affbc4 100644
--- a/embed.h
+++ b/embed.h
@@ -907,6 +907,7 @@
 #  if defined(PERL_IN_REGCOMP_C)
 #define _append_range_to_invlist(a,b,c)	S__append_range_to_invlist(aTHX_ a,b,c)
 #define _invlist_array_init(a,b)	S__invlist_array_init(aTHX_ a,b)
+#define add_above_Latin1_folds(a,b,c)	S_add_above_Latin1_folds(aTHX_ a,b,c)
 #define add_cp_to_invlist(a,b)	S_add_cp_to_invlist(aTHX_ a,b)
 #define add_data		S_add_data
 #define alloc_maybe_populate_EXACT(a,b,c,d,e,f)	S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e,f)
diff --git a/lib/diagnostics.t b/lib/diagnostics.t
index 8868edabed..7c04342735 100644
--- a/lib/diagnostics.t
+++ b/lib/diagnostics.t
@@ -112,7 +112,7 @@ like $warning,
 # ; at end of entry in perldiag.pod
 seek STDERR, 0,0;
 $warning = '';
-warn "Perl folding rules are not up-to-date for 0xA; please use the perlbug utility to report; in regex; marked by <-- HERE in m/\ <-- HERE q/";
+warn "Perl folding rules are not up-to-date for 0x0A; please use the perlbug utility to report; in regex; marked by <-- HERE in m/\ <-- HERE q/";
 like $warning,
     qr/You used a regular expression with case-insensitive matching/s,
     '; works at the end of entries in perldiag.pod';
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 6c02d65e37..0ccf5fb20d 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4147,7 +4147,7 @@ redirected it with select().)
 "Can't locate object method \"%s\" via package \"%s\"".  It often means
 that a method requires a package that has not been loaded.
 
-=item Perl folding rules are not up-to-date for 0x%X; please use the perlbug 
+=item Perl folding rules are not up-to-date for 0x%X; please use the perlbug
 utility to report; in regex; marked by S<<-- HERE> in m/%s/
 
 (S regexp) You used a regular expression with case-insensitive matching,
diff --git a/proto.h b/proto.h
index f5793cc892..06a31e38fe 100644
--- a/proto.h
+++ b/proto.h
@@ -6642,6 +6642,12 @@ PERL_STATIC_INLINE UV*	S__invlist_array_init(pTHX_ SV* const invlist, const bool
 #define PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT	\
 	assert(invlist)
 
+STATIC void	S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+			__attribute__nonnull__(pTHX_1)
+			__attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS	\
+	assert(pRExC_state); assert(invlist)
+
 PERL_STATIC_INLINE SV*	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
 			__attribute__warn_unused_result__;
 
diff --git a/regcomp.c b/regcomp.c
index a6e1818984..693d044e43 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13119,6 +13119,74 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
 }
 #undef IS_OPERAND
 
+STATIC void
+S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+{
+    /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
+     * innocent-looking character class, like /[ks]/i won't have to go out to
+     * disk to find the possible matches.
+     *
+     * This should be called only for a Latin1-range code points, cp, which is
+     * known to be involved in a fold with other code points above Latin1.  It
+     * would give false results if /aa has been specified.  Multi-char folds
+     * are outside the scope of this, and must be handled specially.
+     *
+     * XXX It would be better to generate these via regen, in case a new
+     * version of the Unicode standard adds new mappings, though that is not
+     * really likely, and may be caught by the default: case of the switch
+     * below. */
+
+    PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
+
+    switch (cp) {
+        case 'k':
+        case 'K':
+          *invlist =
+             add_cp_to_invlist(*invlist, KELVIN_SIGN);
+            break;
+        case 's':
+        case 'S':
+          *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
+            break;
+        case MICRO_SIGN:
+          *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
+          *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
+            break;
+        case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+        case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+          *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
+            break;
+        case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+          *invlist = add_cp_to_invlist(*invlist,
+                                        LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+            break;
+        case LATIN_SMALL_LETTER_SHARP_S:
+          *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
+            break;
+        case 'F': case 'f':
+        case 'I': case 'i':
+        case 'L': case 'l':
+        case 'T': case 't':
+        case 'A': case 'a':
+        case 'H': case 'h':
+        case 'J': case 'j':
+        case 'N': case 'n':
+        case 'W': case 'w':
+        case 'Y': case 'y':
+            /* These all are targets of multi-character folds from code points
+             * that require UTF8 to express, so they can't match unless the
+             * target string is in UTF-8, so no action here is necessary, as
+             * regexec.c properly handles the general case for UTF-8 matching
+             * and multi-char folds */
+            break;
+        default:
+            /* Use deprecated warning to increase the chances of this being
+             * output */
+            ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+            break;
+    }
+}
+
 /* The names of properties whose definitions are not known at compile time are
  * stored in this SV, after a constant heading.  So if the length has been
  * changed since initialization, then there is a run-time definition. */
@@ -14338,15 +14406,6 @@ parseit:
 
                     if (j < 256) {
 
-                        /* We have the latin1 folding rules hard-coded here so
-                         * that an innocent-looking character class, like
-                         * /[ks]/i won't have to go out to disk to find the
-                         * possible matches.  XXX It would be better to
-                         * generate these via regen, in case a new version of
-                         * the Unicode standard adds new mappings, though that
-                         * is not really likely, and may be caught by the
-                         * default: case of the switch below. */
-
                         if (IS_IN_SOME_FOLD_L1(j)) {
 
                             /* ASCII is always matched; non-ASCII is matched
@@ -14366,69 +14425,9 @@ parseit:
                         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
                         {
-                            /* Certain Latin1 characters have matches outside
-                            * Latin1.  To get here, <j> is one of those
-                            * characters.   None of these matches is valid for
-                            * ASCII characters under /aa, which is why the 'if'
-                            * just above excludes those.  These matches only
-                            * happen when the target string is utf8.  The code
-                            * below adds the single fold closures for <j> to the
-                            * inversion list. */
-
-                            switch (j) {
-                                case 'k':
-                                case 'K':
-                                  *use_list =
-                                     add_cp_to_invlist(*use_list, KELVIN_SIGN);
-                                    break;
-                                case 's':
-                                case 'S':
-                                  *use_list = add_cp_to_invlist(*use_list,
-                                                    LATIN_SMALL_LETTER_LONG_S);
-                                    break;
-                                case MICRO_SIGN:
-                                  *use_list = add_cp_to_invlist(*use_list,
-                                                      GREEK_CAPITAL_LETTER_MU);
-                                  *use_list = add_cp_to_invlist(*use_list,
-                                                        GREEK_SMALL_LETTER_MU);
-                                    break;
-                                case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
-                                case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
-                                  *use_list =
-                                   add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
-                                    break;
-                                case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
-                                  *use_list = add_cp_to_invlist(*use_list,
-                                        LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
-                                    break;
-                                case LATIN_SMALL_LETTER_SHARP_S:
-                                  *use_list = add_cp_to_invlist(*use_list,
-                                                 LATIN_CAPITAL_LETTER_SHARP_S);
-                                    break;
-                                case 'F': case 'f':
-                                case 'I': case 'i':
-                                case 'L': case 'l':
-                                case 'T': case 't':
-                                case 'A': case 'a':
-                                case 'H': case 'h':
-                                case 'J': case 'j':
-                                case 'N': case 'n':
-                                case 'W': case 'w':
-                                case 'Y': case 'y':
-                                    /* These all are targets of multi-character
-                                     * folds from code points that require UTF8
-                                     * to express, so they can't match unless
-                                     * the target string is in UTF-8, so no
-                                     * action here is necessary, as regexec.c
-                                     * properly handles the general case for
-                                     * UTF-8 matching and multi-char folds */
-                                    break;
-                                default:
-                                    /* Use deprecated warning to increase the
-                                    * chances of this being output */
-                                    ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
-                                    break;
-                            }
+                            add_above_Latin1_folds(pRExC_state,
+                                                   (U8) j,
+                                                   use_list);
                         }
                         continue;
                     }
author	Karl Williamson <khw@cpan.org>	2014-04-28 20:55:50 -0600
committer	Karl Williamson <khw@cpan.org>	2014-05-30 10:33:02 -0600
commit	8954b91a52258523bade4624c06e76d720ed31ae (patch)
tree	9a5b6b54a3d4a01eb6da9e9b8d421f62edafe6b1
parent	4f3da64afef7a66791ba9b5f80dd8a1c044604a6 (diff)
download	perl-8954b91a52258523bade4624c06e76d720ed31ae.tar.gz