summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc3
-rw-r--r--embed.h3
-rw-r--r--handy.h8
-rw-r--r--proto.h9
-rw-r--r--utf8.c144
5 files changed, 155 insertions, 12 deletions
diff --git a/embed.fnc b/embed.fnc
index c7816d531c..9d4a8461f5 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1811,6 +1811,9 @@ s |UV |_to_utf8_case |const UV uv1 \
|NULLOK const unsigned int * const * const aux_tables \
|NULLOK const U8 * const aux_table_lengths \
|NN const char * const normal
+s |UV |turkic_fc |NN const U8 * const p |NN const U8 * const e|NN U8* ustrp|NN STRLEN *lenp
+s |UV |turkic_lc |NN const U8 * const p0|NN const U8 * const e|NN U8* ustrp|NN STRLEN *lenp
+s |UV |turkic_uc |NN const U8 * const p |NN const U8 * const e|NN U8* ustrp|NN STRLEN *lenp
#endif
ApbmdD |UV |to_utf8_lower |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
AMp |UV |_to_utf8_lower_flags|NN const U8 *p|NULLOK const U8* e \
diff --git a/embed.h b/embed.h
index 149f1bee25..4df6fa0b0f 100644
--- a/embed.h
+++ b/embed.h
@@ -2081,6 +2081,9 @@
#define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
#define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c)
#define to_lower_latin1 S_to_lower_latin1
+#define turkic_fc(a,b,c,d) S_turkic_fc(aTHX_ a,b,c,d)
+#define turkic_lc(a,b,c,d) S_turkic_lc(aTHX_ a,b,c,d)
+#define turkic_uc(a,b,c,d) S_turkic_uc(aTHX_ a,b,c,d)
#define unexpected_non_continuation_text(a,b,c,d) S_unexpected_non_continuation_text(aTHX_ a,b,c,d)
#define warn_on_first_deprecated_use(a,b,c,d,e) S_warn_on_first_deprecated_use(aTHX_ a,b,c,d,e)
# endif
diff --git a/handy.h b/handy.h
index 954b9caa30..57ad62dc61 100644
--- a/handy.h
+++ b/handy.h
@@ -1542,7 +1542,7 @@ END_EXTERN_C
/* These next three are also for internal core Perl use only: case-change
* helper macros. The reason for using the PL_latin arrays is in case the
* system function is defective; it ensures uniform results that conform to the
- * Unicod standard. */
+ * Unicod standard. It does not handle the anomalies in UTF-8 Turkic locales */
#define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \
? (c) \
: (IN_UTF8_CTYPE_LOCALE) \
@@ -1553,7 +1553,8 @@ END_EXTERN_C
* returns a single value, so can't adequately return the upper case of LATIN
* SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
* values "SS"); instead it asserts against that under DEBUGGING, and
- * otherwise returns its input */
+ * otherwise returns its input. It does not handle the anomalies in UTF-8
+ * Turkic locales. */
#define _generic_toUPPER_LC(c, function, cast) \
(! FITS_IN_8_BITS(c) \
? (c) \
@@ -1571,7 +1572,8 @@ END_EXTERN_C
* returns a single value, so can't adequately return the fold case of LATIN
* SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two
* values "ss"); instead it asserts against that under DEBUGGING, and
- * otherwise returns its input */
+ * otherwise returns its input. It does not handle the anomalies in UTF-8
+ * Turkic locales */
#define _generic_toFOLD_LC(c, function, cast) \
((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \
? GREEK_SMALL_LETTER_MU \
diff --git a/proto.h b/proto.h
index ba5623d4a2..adf1ef5d40 100644
--- a/proto.h
+++ b/proto.h
@@ -6243,6 +6243,15 @@ STATIC SV* S_swatch_get(pTHX_ SV* swash, UV start, UV span)
STATIC U8 S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp, const char dummy)
__attribute__warn_unused_result__;
+STATIC UV S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e, U8* ustrp, STRLEN *lenp);
+#define PERL_ARGS_ASSERT_TURKIC_FC \
+ assert(p); assert(e); assert(ustrp); assert(lenp)
+STATIC UV S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e, U8* ustrp, STRLEN *lenp);
+#define PERL_ARGS_ASSERT_TURKIC_LC \
+ assert(p0); assert(e); assert(ustrp); assert(lenp)
+STATIC UV S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, U8* ustrp, STRLEN *lenp);
+#define PERL_ARGS_ASSERT_TURKIC_UC \
+ assert(p); assert(e); assert(ustrp); assert(lenp)
STATIC char * S_unexpected_non_continuation_text(pTHX_ const U8 * const s, STRLEN print_len, const STRLEN non_cont_byte_pos, const STRLEN expect_len)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT \
diff --git a/utf8.c b/utf8.c
index fc4a0c1dce..94b68017a2 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3077,10 +3077,10 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
if (flags & FOLD_FLAGS_LOCALE) {
- /* Treat a UTF-8 locale as not being in locale at all, except for
- * potentially warning */
+ /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
+ * except for potentially warning */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- if (IN_UTF8_CTYPE_LOCALE) {
+ if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
flags &= ~FOLD_FLAGS_LOCALE;
}
else {
@@ -3716,6 +3716,119 @@ S_check_and_deprecate(pTHX_ const U8 *p,
return utf8n_flags;
}
+STATIC UV
+S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
+ U8 * ustrp, STRLEN *lenp)
+{
+ /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
+ * p0..e-1 according to Turkic rules is the same as for non-Turkic.
+ * Otherwise, it returns the first code point of the Turkic foldcased
+ * sequence, and the entire sequence will be stored in *ustrp. ustrp will
+ * contain *lenp bytes
+ *
+ * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+ * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
+ * DOTLESS I */
+
+ PERL_ARGS_ASSERT_TURKIC_FC;
+ assert(e > p);
+
+ if (UNLIKELY(*p == 'I')) {
+ *lenp = 2;
+ ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+ ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+ return LATIN_SMALL_LETTER_DOTLESS_I;
+ }
+
+ if (UNLIKELY(memBEGINs(p, e - p,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
+ {
+ *lenp = 1;
+ *ustrp = 'i';
+ return 'i';
+ }
+
+ return 0;
+}
+
+STATIC UV
+S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
+ U8 * ustrp, STRLEN *lenp)
+{
+ /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
+ * p0..e-1 according to Turkic rules is the same as for non-Turkic.
+ * Otherwise, it returns the first code point of the Turkic lowercased
+ * sequence, and the entire sequence will be stored in *ustrp. ustrp will
+ * contain *lenp bytes */
+
+ PERL_ARGS_ASSERT_TURKIC_LC;
+ assert(e > p0);
+
+ /* A 'I' requires context as to what to do */
+ if (UNLIKELY(*p0 == 'I')) {
+ const U8 * p = p0 + 1;
+
+ /* According to the Unicode SpecialCasing.txt file, a capital 'I'
+ * modified by a dot above lowercases to 'i' even in turkic locales. */
+ while (p < e) {
+ UV cp;
+
+ if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
+ ustrp[0] = 'i';
+ *lenp = 1;
+ return 'i';
+ }
+
+ /* For the dot above to modify the 'I', it must be part of a
+ * combining sequence immediately following the 'I', and no other
+ * modifier with a ccc of 230 may intervene */
+ cp = utf8_to_uvchr_buf(p, e, NULL);
+ if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
+ break;
+ }
+
+ /* Here the combining sequence continues */
+ p += UTF8SKIP(p);
+ }
+ }
+
+ /* In all other cases the lc is the same as the fold */
+ return turkic_fc(p0, e, ustrp, lenp);
+}
+
+STATIC UV
+S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
+ U8 * ustrp, STRLEN *lenp)
+{
+ /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
+ * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
+ * Otherwise, it returns the first code point of the Turkic upper or
+ * title-cased sequence, and the entire sequence will be stored in *ustrp.
+ * ustrp will contain *lenp bytes
+ *
+ * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+ * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER
+ * DOTLESS I */
+
+ PERL_ARGS_ASSERT_TURKIC_UC;
+ assert(e > p);
+
+ if (*p == 'i') {
+ *lenp = 2;
+ ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
+ }
+
+ if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
+ *lenp = 1;
+ *ustrp = 'I';
+ return 'I';
+ }
+
+ return 0;
+}
+
/* The process for changing the case is essentially the same for the four case
* change types, except there are complications for folding. Otherwise the
* difference is only which case to change to. To make sure that they all do
@@ -3742,15 +3855,24 @@ S_check_and_deprecate(pTHX_ const U8 *p,
* the input code point calculated from the UTF-8. The fold code needs to
* realize all this and take it from there.
*
+ * To deal with Turkic locales, the function specified by the parameter
+ * 'turkic' is called when appropriate.
+ *
* If you read the two macros as sequential, it's easier to understand what's
* going on. */
#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
- L1_func_extra_param) \
+ L1_func_extra_param, turkic) \
\
if (flags & (locale_flags)) { \
_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
- /* Treat a UTF-8 locale as not being in locale at all */ \
if (IN_UTF8_CTYPE_LOCALE) { \
+ if (UNLIKELY(PL_in_utf8_turkic_locale)) { \
+ UV ret = turkic(p, e, ustrp, lenp); \
+ if (ret) return ret; \
+ } \
+ \
+ /* Otherwise, treat a UTF-8 locale as not being in locale at \
+ * all */ \
flags &= ~(locale_flags); \
} \
} \
@@ -3830,7 +3952,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
/* 2nd char of uc(U+DF) is 'S' */
- CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
+ CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S',
+ turkic_uc);
CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
}
@@ -3863,7 +3986,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p,
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
/* 2nd char of ucfirst(U+DF) is 's' */
- CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
+ CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's',
+ turkic_uc);
CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
}
@@ -3894,7 +4018,8 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
- CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
+ CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */,
+ turkic_lc);
CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
}
@@ -3936,7 +4061,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
assert(p != ustrp); /* Otherwise overwrites */
CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
- ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
+ ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
+ turkic_fc);
result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);