diff options
author | Karl Williamson <khw@cpan.org> | 2015-06-21 21:38:32 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-07-28 22:15:53 -0600 |
commit | 9b63e895ee43cb3717f72cde64f6a658b1b46dd7 (patch) | |
tree | bf381dfa72b0e8c5bd5c89b393f635405b0189a6 | |
parent | ce6e23cf44eba8e2f28aeef6eafc77ef073a33a9 (diff) | |
download | perl-9b63e895ee43cb3717f72cde64f6a658b1b46dd7.tar.gz |
There are no folds to multiple chars in early Unicode versions
Several places require special handling for multi-character folds, notably for
the lowercase Sharp S, but this handling is not needed in Unicode versions before 3.0.1.
-rw-r--r-- | charclass_invlists.h | 2 | ||||
-rw-r--r-- | lib/unicore/mktables | 3 | ||||
-rw-r--r-- | pp.c | 6 | ||||
-rw-r--r-- | regcharclass.h | 4 | ||||
-rw-r--r-- | regcomp.c | 43 | ||||
-rw-r--r-- | regen/regcharclass_multi_char_folds.pl | 2 | ||||
-rw-r--r-- | utf8.c | 4 |
7 files changed, 53 insertions, 11 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index e08c668efd..f8a413ef3b 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -99521,7 +99521,7 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * d047d201914bceead70e2d1ffbe4847c3e15cd52325e7e8f58b8a719e26fd3ee lib/unicore/mktables + * 234024b47cb9fd57ae95b60cd4e9087feea586b70c1243b84141534d1bca2918 lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl * f199f92c0b5f87882b0198936ea8ef3dc43627b57a77ac3eb9250bd2664bbd88 regen/mk_invlists.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index fdcb0fff6b..2f6fc5ff2f 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -13989,6 +13989,9 @@ sub compile_perl() { Description => "Code points whose fold is a string of more than one character", ); + if ($v_version lt v3.0.1) { + push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char'; + } # Look through all the known folds to populate these tables. foreach my $range ($cf->ranges) { @@ -4401,8 +4401,14 @@ PP(pp_fc) const U8 *send; U8 *d; U8 tmpbuf[UTF8_MAXBYTES_CASE + 1]; +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) const bool full_folding = TRUE; /* This variable is here so we can easily move to more generality later */ +#else + const bool full_folding = FALSE; +#endif const U8 flags = ( full_folding ? FOLD_FLAGS_FULL : 0 ) #ifdef USE_LOCALE_CTYPE | ( IN_LC_RUNTIME(LC_CTYPE) ? 
FOLD_FLAGS_LOCALE : 0 ) diff --git a/regcharclass.h b/regcharclass.h index 32a2b4b213..a9409afdd7 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -2514,9 +2514,9 @@ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * d047d201914bceead70e2d1ffbe4847c3e15cd52325e7e8f58b8a719e26fd3ee lib/unicore/mktables + * 234024b47cb9fd57ae95b60cd4e9087feea586b70c1243b84141534d1bca2918 lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl * 5e47f645eac3a918246254e19c06b604c8ea088cf62da5be84dcb953ef2bf16c regen/regcharclass.pl - * 206b60035ff0cec9f7d1701937ecf9226a943faa42dfc4827c37306be64ff18e regen/regcharclass_multi_char_folds.pl + * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ @@ -3652,6 +3652,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this function, we need to flag any occurrences of the sharp s. 
* This character forbids trie formation (because of added * complexity) */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { OP(scan) = EXACTFA_NO_TRIE; @@ -3704,6 +3707,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, *min_subtract += len - 1; s += len; } +#endif } } @@ -11504,8 +11508,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, *character = (U8) code_point; len = 1; } /* Else is folded non-UTF8 */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { - +#else + else if (1) { +#endif /* We don't fold any non-UTF8 except possibly the Sharp s (see * comments at join_exact()); */ *character = (U8) code_point; @@ -11549,9 +11558,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, /* A single character node is SIMPLE, except for the special-cased SHARP S * under /di. */ if ((len == 1 || (UTF && len == UNISKIP(code_point))) - && (code_point != LATIN_SMALL_LETTER_SHARP_S - || ! FOLD || ! DEPENDS_SEMANTICS)) - { +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + && ( code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS) +#endif + ) { *flagp |= SIMPLE; } @@ -12649,11 +12662,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else /* A regular FOLD code point */ if (! 
( UTF +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S))) - { + && ender == LATIN_SMALL_LETTER_SHARP_S) +#endif + )) { /* Here, are folding and are not UTF-8 encoded; therefore * the character must be in the range 0-255, and is not /l * (Not /l because we already handled these under /l in @@ -12666,11 +12683,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * 'ss' */ if (maybe_exactfu && (PL_fold[ender] != PL_fold_latin1[ender] +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) || ender == LATIN_SMALL_LETTER_SHARP_S || (len > 0 && isALPHA_FOLD_EQ(ender, 's') - && isALPHA_FOLD_EQ(*(s-1), 's')))) - { + && isALPHA_FOLD_EQ(*(s-1), 's')) +#endif + )) { maybe_exactfu = FALSE; } } @@ -14214,6 +14235,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, DEBUG_PARSE("clas"); +#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 0) + allow_multi_folds = FALSE; +#endif + /* Assume we are going to generate an ANYOF node. 
*/ ret = reganode(pRExC_state, (LOC) diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl index dfc8f9f3e5..5ea9d33a25 100644 --- a/regen/regcharclass_multi_char_folds.pl +++ b/regen/regcharclass_multi_char_folds.pl @@ -63,6 +63,8 @@ sub multi_char_folds ($) { # multi-char folds; false if just the ones that # are all ascii + return () if pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) lt v3.0.1; + my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); die "Could not find inversion map for Case_Folding" unless defined $format; die "Incorrect format '$format' for Case_Folding inversion map" @@ -1540,6 +1540,9 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f if (c == MICRO_SIGN) { converted = GREEK_SMALL_LETTER_MU; } +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) { /* If can't cross 127/128 boundary, can't return "ss"; instead return @@ -1558,6 +1561,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f return 's'; } } +#endif else { /* In this range the fold of all other characters is their lower case */ converted = toLOWER_LATIN1(c); |