diff options
author | Karl Williamson <khw@cpan.org> | 2015-05-19 13:25:21 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-07-28 22:15:51 -0600 |
commit | 0766489e84dfe5c9f36282765a12fb565e2e8dbc (patch) | |
tree | 2864a896d3a47903fe9c4b7d47055fd86c8fe6ed | |
parent | 8f57fa7d32a4c3ae3d1497b23c7dd74a0feeaec1 (diff) | |
download | perl-0766489e84dfe5c9f36282765a12fb565e2e8dbc.tar.gz |
Allow Perl to compile and work on Unicode releases without U+1E9E
U+1E9E LATIN CAPITAL LETTER SHARP S is handled specially by Perl,
because of its relationship to the infamous LATIN SMALL LETTER SHARP S,
which folds to 'ss', being the only character whose code point is < 256
to have a multi-character fold, and this creates lots of special cases.
But U+1E9E wasn't in all Unicode releases. Because Perl is supposed to
work with any release, we need to be able to compile when this character
isn't defined. In some of those cases we use U+017F (LATIN SMALL LETTER
LONG S) instead, which is in all releases.
-rw-r--r-- | regcomp.c | 6 | ||||
-rw-r--r-- | regexec.c | 2 | ||||
-rw-r--r-- | utf8.c | 18 | ||||
-rw-r--r-- | utf8.h | 4 |
4 files changed, 22 insertions, 8 deletions
@@ -14014,9 +14014,15 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); break; + +#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ + case LATIN_SMALL_LETTER_SHARP_S: *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); break; + +#endif + default: /* Use deprecated warning to increase the chances of this being * output */ @@ -192,7 +192,7 @@ static const char* const non_utf8_target_but_utf8_required PL_utf8_swash_ptrs[_CC_WORDCHAR], \ "", \ PL_XPosix_ptrs[_CC_WORDCHAR], \ - LATIN_CAPITAL_LETTER_SHARP_S_UTF8); + LATIN_SMALL_LIGATURE_LONG_S_T_UTF8); #define PLACEHOLDER /* Something for the preprocessor to grab onto */ /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ @@ -2200,11 +2200,13 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) if (flags & FOLD_FLAGS_LOCALE) { -# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 + const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1; + +# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 +# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1; - const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1; /* Special case these two characters, as what normally gets * returned under locale doesn't work */ @@ -2217,7 +2219,9 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) "resolved to \"\\x{17F}\\x{17F}\"."); goto return_long_s; } - else if (UTF8SKIP(p) == long_s_t_len + else +#endif + if (UTF8SKIP(p) == long_s_t_len && memEQ((char *) p, LONG_S_T, long_s_t_len)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". 
*/ @@ -2249,9 +2253,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) /* But in these instances, there is an alternative we can * return that is valid */ - if (original == LATIN_CAPITAL_LETTER_SHARP_S - || original == LATIN_SMALL_LETTER_SHARP_S) - { + if (original == LATIN_SMALL_LETTER_SHARP_S +#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ + || original == LATIN_CAPITAL_LETTER_SHARP_S +#endif + ) { goto return_long_s; } else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { @@ -611,7 +611,9 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRON */ #define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */ -#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E +#ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 +# define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E +#endif #define LATIN_SMALL_LETTER_LONG_S 0x017F #define LATIN_SMALL_LIGATURE_LONG_S_T 0xFB05 #define LATIN_SMALL_LIGATURE_ST 0xFB06 |