summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-05-19 13:25:21 -0600
committerKarl Williamson <khw@cpan.org>2015-07-28 22:15:51 -0600
commit0766489e84dfe5c9f36282765a12fb565e2e8dbc (patch)
tree2864a896d3a47903fe9c4b7d47055fd86c8fe6ed
parent8f57fa7d32a4c3ae3d1497b23c7dd74a0feeaec1 (diff)
downloadperl-0766489e84dfe5c9f36282765a12fb565e2e8dbc.tar.gz
Allow Perl to compile and work on Unicode releases without U+1E9E
U+1E9E LATIN CAPITAL LETTER SHARP S is handled specially by Perl because of its relationship to the infamous LATIN SMALL LETTER SHARP S, which folds to 'ss'. The latter is the only character whose code point is < 256 to have a multi-character fold, and this creates lots of special cases. But U+1E9E wasn't in all Unicode releases. Because Perl is supposed to work with any release, we need to be able to compile when this character isn't defined. In some of those cases we use U+017F (LATIN SMALL LETTER LONG S) instead, which is in all releases.
-rw-r--r--regcomp.c6
-rw-r--r--regexec.c2
-rw-r--r--utf8.c18
-rw-r--r--utf8.h4
4 files changed, 22 insertions, 8 deletions
diff --git a/regcomp.c b/regcomp.c
index fcaf153ef6..476ace0576 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -14014,9 +14014,15 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
*invlist = add_cp_to_invlist(*invlist,
LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
break;
+
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+
case LATIN_SMALL_LETTER_SHARP_S:
*invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
break;
+
+#endif
+
default:
/* Use deprecated warning to increase the chances of this being
* output */
diff --git a/regexec.c b/regexec.c
index ec4ed861ab..0ea2aa070c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -192,7 +192,7 @@ static const char* const non_utf8_target_but_utf8_required
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
"", \
PL_XPosix_ptrs[_CC_WORDCHAR], \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
+ LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
diff --git a/utf8.c b/utf8.c
index cbff7a7888..591174621e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2200,11 +2200,13 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
if (flags & FOLD_FLAGS_LOCALE) {
-# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
+ const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
+
+# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
- const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
/* Special case these two characters, as what normally gets
* returned under locale doesn't work */
@@ -2217,7 +2219,9 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
"resolved to \"\\x{17F}\\x{17F}\".");
goto return_long_s;
}
- else if (UTF8SKIP(p) == long_s_t_len
+ else
+#endif
+ if (UTF8SKIP(p) == long_s_t_len
&& memEQ((char *) p, LONG_S_T, long_s_t_len))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
@@ -2249,9 +2253,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
/* But in these instances, there is an alternative we can
* return that is valid */
- if (original == LATIN_CAPITAL_LETTER_SHARP_S
- || original == LATIN_SMALL_LETTER_SHARP_S)
- {
+ if (original == LATIN_SMALL_LETTER_SHARP_S
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+ || original == LATIN_CAPITAL_LETTER_SHARP_S
+#endif
+ ) {
goto return_long_s;
}
else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
diff --git a/utf8.h b/utf8.h
index 3e15707ff8..235d15c480 100644
--- a/utf8.h
+++ b/utf8.h
@@ -611,7 +611,9 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case
of MICRON */
#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
-#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+# define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
+#endif
#define LATIN_SMALL_LETTER_LONG_S 0x017F
#define LATIN_SMALL_LIGATURE_LONG_S_T 0xFB05
#define LATIN_SMALL_LIGATURE_ST 0xFB06