summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2018-11-06 17:31:21 -0700
committer: Karl Williamson <khw@cpan.org> 2018-11-16 10:48:18 -0700
commit: e2fd06413d9a59faa671e09a6c1b3259b8f406e3 (patch)
tree: 1d8fc17d9d7da13ee9b7d8a34a6986e36a74853e
parent: 401135611bab46c53f48203f950ce631f2f05308 (diff)
download: perl-e2fd06413d9a59faa671e09a6c1b3259b8f406e3.tar.gz
regcomp.c: Refactor constructing EXACTish nodes
The previous commits have allowed us to refactor this to eliminate redundancies. Previously, the same logic was implemented separately for UTF-8 and non-UTF-8 patterns. This commit refactors the code so that the logic appears only once. Only the details differ between UTF-8 and non-UTF-8 patterns, so those details are now the only places where the two cases diverge, and the shared logic no longer has to be duplicated.
-rw-r--r-- regcomp.c 198
1 files changed, 73 insertions, 125 deletions
diff --git a/regcomp.c b/regcomp.c
index ffe4bec1c5..7f6bbc81b6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -14138,156 +14138,104 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* character, and wait until runtime to fold it */
goto not_fold_common;
}
- else /* A regular FOLD code point */
- if (! UTF)
+ else /* regular fold; see if actually is in a fold */
+ if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
+ || (ender > 255
+ && ! _invlist_contains_cp(PL_utf8_foldable, ender)))
{
- /* Here, are folding and are not UTF-8 encoded; therefore
- * the character must be in the range 0-255, and is not /l.
- * (Not /l because we already handled these under /l in
- * is_PROBLEMATIC_LOCALE_FOLD_cp) */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
-
- /* Start a new node for this non-folding character if
- * previous ones in the node were folded */
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
+ /* Here, folding, but the character isn't in a fold.
+ *
+ * Start a new node if previous characters in the node were
+ * folded */
+ if (len && node_type != EXACT) {
+ p = oldp;
+ goto loopdone;
+ }
+ /* Here, continuing a node with non-folded characters. Add
+ * this one */
+
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
*(s++) = (char) ender;
}
- else { /* Here, does participate in some fold */
-
- /* if this is the first character in the node, change
- * its type to folding. Otherwise, if this is the
- * first folding character in the node, close up the
- * existing node, so can start a new node with this
- * one. */
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
+ else {
+ s = (char *) uvchr_to_utf8((U8 *) s, ender);
+ added_len = UVCHR_SKIP(ender);
+ }
+ }
+ else { /* Here, does participate in some fold */
+
+ /* If this is the first character in the node, change its
+ * type to folding. Otherwise, if this is the first
+ * folding character in the node, close up the existing
+ * node, so can start a new node with this one. */
+ if (! len) {
+ node_type = compute_EXACTish(pRExC_state);
+ }
+ else if (node_type == EXACT) {
+ p = oldp;
+ goto loopdone;
+ }
+
+ if (UTF) { /* For UTF-8, we add the folded value */
+ if (UVCHR_IS_INVARIANT(ender)) {
+ *(s)++ = (U8) toFOLD(ender);
}
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
+ else {
+ ender = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ s += added_len;
}
+ }
+ else {
- /* See if the character's fold differs between /d and
- * /u. On non-ancient Unicode versions, this includes
- * the multi-char fold SHARP S to 'ss' */
+ /* Here is non-UTF8; we don't normally store the folded
+ * value. First, see if the character's fold differs
+ * between /d and /u. */
+ if (PL_fold[ender] != PL_fold_latin1[ender]) {
+ maybe_exactfu = FALSE;
+ }
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ /* On non-ancient Unicode versions, this includes the
+ * multi-char fold SHARP S to 'ss' */
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- if (node_type == EXACTFU) {
- *(s++) = 's';
+ else if (UNLIKELY( ender == LATIN_SMALL_LETTER_SHARP_S
+ || ( len
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))))
+ {
- /* Let the code below add in the extra 's' */
- ender = 's';
- added_len = 2;
+ if (node_type == EXACTFU) {
+ /* See comments for join_exact() as to why we
+ * fold this non-UTF at compile time */
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ *(s++) = 's';
+
+ /* Let the code below add in the extra 's' */
+ ender = 's';
+ added_len = 2;
+ }
}
- else if (! RExC_uni_semantics) {
+ else {
maybe_exactfu = FALSE;
}
}
- else if ( len
- && isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))
- {
- maybe_exactfu = FALSE;
- }
- else
#endif
- if (PL_fold[ender] != PL_fold_latin1[ender]) {
- maybe_exactfu = FALSE;
- }
-
/* Even when folding, we store just the input
* character, as we have an array that finds its fold
* quickly */
*(s++) = (char) ender;
}
- }
- else { /* FOLD, and UTF */
- /* Unlike the non-fold case, we do actually have to
- * calculate the fold in pass 1. This is for two reasons,
- * the folded length may be longer than the unfolded, and
- * we have to calculate how many EXACTish nodes it will
- * take; and we may run out of room in a node in the middle
- * of a potential multi-char fold, and have to back off
- * accordingly. */
-
- if (isASCII_uni(ender)) {
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- *(s)++ = (U8) ender;
- }
- else { /* Is in a fold */
-
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- *(s)++ = (U8) toFOLD(ender);
- }
- }
- else { /* Not ASCII */
- STRLEN foldlen;
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! _invlist_contains_cp(PL_utf8_foldable, ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- s = (char *) uvchr_to_utf8((U8 *) s, ender);
- added_len = UVCHR_SKIP(ender);
- }
- else {
-
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- ender = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
- s += foldlen;
- added_len = foldlen;
- }
- }
} /* End of adding current character to the node */
len += added_len;