diff options
-rw-r--r-- | embed.fnc | 3 | ||||
-rw-r--r-- | embed.h | 3 | ||||
-rw-r--r-- | handy.h | 8 | ||||
-rw-r--r-- | proto.h | 9 | ||||
-rw-r--r-- | utf8.c | 144 |
5 files changed, 155 insertions, 12 deletions
@@ -1811,6 +1811,9 @@ s |UV |_to_utf8_case |const UV uv1 \ |NULLOK const unsigned int * const * const aux_tables \ |NULLOK const U8 * const aux_table_lengths \ |NN const char * const normal +s |UV |turkic_fc |NN const U8 * const p |NN const U8 * const e|NN U8* ustrp|NN STRLEN *lenp +s |UV |turkic_lc |NN const U8 * const p0|NN const U8 * const e|NN U8* ustrp|NN STRLEN *lenp +s |UV |turkic_uc |NN const U8 * const p |NN const U8 * const e|NN U8* ustrp|NN STRLEN *lenp #endif ApbmdD |UV |to_utf8_lower |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp AMp |UV |_to_utf8_lower_flags|NN const U8 *p|NULLOK const U8* e \ @@ -2081,6 +2081,9 @@ #define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g) #define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c) #define to_lower_latin1 S_to_lower_latin1 +#define turkic_fc(a,b,c,d) S_turkic_fc(aTHX_ a,b,c,d) +#define turkic_lc(a,b,c,d) S_turkic_lc(aTHX_ a,b,c,d) +#define turkic_uc(a,b,c,d) S_turkic_uc(aTHX_ a,b,c,d) #define unexpected_non_continuation_text(a,b,c,d) S_unexpected_non_continuation_text(aTHX_ a,b,c,d) #define warn_on_first_deprecated_use(a,b,c,d,e) S_warn_on_first_deprecated_use(aTHX_ a,b,c,d,e) # endif @@ -1542,7 +1542,7 @@ END_EXTERN_C /* These next three are also for internal core Perl use only: case-change * helper macros. The reason for using the PL_latin arrays is in case the * system function is defective; it ensures uniform results that conform to the - * Unicod standard. */ + * Unicod standard. It does not handle the anomalies in UTF-8 Turkic locales */ #define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \ ? (c) \ : (IN_UTF8_CTYPE_LOCALE) \ @@ -1553,7 +1553,8 @@ END_EXTERN_C * returns a single value, so can't adequately return the upper case of LATIN * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two * values "SS"); instead it asserts against that under DEBUGGING, and - * otherwise returns its input */ + * otherwise returns its input. It does not handle the anomalies in UTF-8 + * Turkic locales. */ #define _generic_toUPPER_LC(c, function, cast) \ (! FITS_IN_8_BITS(c) \ ? (c) \ @@ -1571,7 +1572,8 @@ END_EXTERN_C * returns a single value, so can't adequately return the fold case of LATIN * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two * values "ss"); instead it asserts against that under DEBUGGING, and - * otherwise returns its input */ + * otherwise returns its input. It does not handle the anomalies in UTF-8 + * Turkic locales */ #define _generic_toFOLD_LC(c, function, cast) \ ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \ ? GREEK_SMALL_LETTER_MU \ @@ -6243,6 +6243,15 @@ STATIC SV* S_swatch_get(pTHX_ SV* swash, UV start, UV span) STATIC U8 S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp, const char dummy) __attribute__warn_unused_result__; +STATIC UV S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e, U8* ustrp, STRLEN *lenp); +#define PERL_ARGS_ASSERT_TURKIC_FC \ + assert(p); assert(e); assert(ustrp); assert(lenp) +STATIC UV S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e, U8* ustrp, STRLEN *lenp); +#define PERL_ARGS_ASSERT_TURKIC_LC \ + assert(p0); assert(e); assert(ustrp); assert(lenp) +STATIC UV S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, U8* ustrp, STRLEN *lenp); +#define PERL_ARGS_ASSERT_TURKIC_UC \ + assert(p); assert(e); assert(ustrp); assert(lenp) STATIC char * S_unexpected_non_continuation_text(pTHX_ const U8 * const s, STRLEN print_len, const STRLEN non_cont_byte_pos, const STRLEN expect_len) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT \ @@ -3077,10 +3077,10 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; if (flags & FOLD_FLAGS_LOCALE) { - /* Treat a UTF-8 locale as not being in locale at all, except for - * potentially warning */ + /* Treat a non-Turkic UTF-8 locale as not being in locale at all, + * except for potentially warning */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if (IN_UTF8_CTYPE_LOCALE) { + if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) { flags &= ~FOLD_FLAGS_LOCALE; } else { @@ -3716,6 +3716,119 @@ S_check_and_deprecate(pTHX_ const U8 *p, return utf8n_flags; } +STATIC UV +S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e, + U8 * ustrp, STRLEN *lenp) +{ + /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from + * p0..e-1 according to Turkic rules is the same as for non-Turkic. + * Otherwise, it returns the first code point of the Turkic foldcased + * sequence, and the entire sequence will be stored in *ustrp. ustrp will + * contain *lenp bytes + * + * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER + * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER + * DOTLESS I */ + + PERL_ARGS_ASSERT_TURKIC_FC; + assert(e > p); + + if (UNLIKELY(*p == 'I')) { + *lenp = 2; + ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I); + ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I); + return LATIN_SMALL_LETTER_DOTLESS_I; + } + + if (UNLIKELY(memBEGINs(p, e - p, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8))) + { + *lenp = 1; + *ustrp = 'i'; + return 'i'; + } + + return 0; +} + +STATIC UV +S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e, + U8 * ustrp, STRLEN *lenp) +{ + /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from + * p0..e-1 according to Turkic rules is the same as for non-Turkic. + * Otherwise, it returns the first code point of the Turkic lowercased + * sequence, and the entire sequence will be stored in *ustrp. ustrp will + * contain *lenp bytes */ + + PERL_ARGS_ASSERT_TURKIC_LC; + assert(e > p0); + + /* A 'I' requires context as to what to do */ + if (UNLIKELY(*p0 == 'I')) { + const U8 * p = p0 + 1; + + /* According to the Unicode SpecialCasing.txt file, a capital 'I' + * modified by a dot above lowercases to 'i' even in turkic locales. */ + while (p < e) { + UV cp; + + if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) { + ustrp[0] = 'i'; + *lenp = 1; + return 'i'; + } + + /* For the dot above to modify the 'I', it must be part of a + * combining sequence immediately following the 'I', and no other + * modifier with a ccc of 230 may intervene */ + cp = utf8_to_uvchr_buf(p, e, NULL); + if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) { + break; + } + + /* Here the combining sequence continues */ + p += UTF8SKIP(p); + } + } + + /* In all other cases the lc is the same as the fold */ + return turkic_fc(p0, e, ustrp, lenp); +} + +STATIC UV +S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, + U8 * ustrp, STRLEN *lenp) +{ + /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence + * from p0..e-1 according to Turkic rules is the same as for non-Turkic. + * Otherwise, it returns the first code point of the Turkic upper or + * title-cased sequence, and the entire sequence will be stored in *ustrp. + * ustrp will contain *lenp bytes + * + * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER + * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER + * DOTLESS I */ + + PERL_ARGS_ASSERT_TURKIC_UC; + assert(e > p); + + if (*p == 'i') { + *lenp = 2; + ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; + } + + if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) { + *lenp = 1; + *ustrp = 'I'; + return 'I'; + } + + return 0; +} + /* The process for changing the case is essentially the same for the four case * change types, except there are complications for folding. Otherwise the * difference is only which case to change to. To make sure that they all do @@ -3742,15 +3855,24 @@ S_check_and_deprecate(pTHX_ const U8 *p, * the input code point calculated from the UTF-8. The fold code needs to * realize all this and take it from there. * + * To deal with Turkic locales, the function specified by the parameter + * 'turkic' is called when appropriate. + * * If you read the two macros as sequential, it's easier to understand what's * going on. */ #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \ - L1_func_extra_param) \ + L1_func_extra_param, turkic) \ \ if (flags & (locale_flags)) { \ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ - /* Treat a UTF-8 locale as not being in locale at all */ \ if (IN_UTF8_CTYPE_LOCALE) { \ + if (UNLIKELY(PL_in_utf8_turkic_locale)) { \ + UV ret = turkic(p, e, ustrp, lenp); \ + if (ret) return ret; \ + } \ + \ + /* Otherwise, treat a UTF-8 locale as not being in locale at \ + * all */ \ flags &= ~(locale_flags); \ } \ } \ @@ -3830,7 +3952,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */ /* 2nd char of uc(U+DF) is 'S' */ - CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S'); + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S', + turkic_uc); CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE); } @@ -3863,7 +3986,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; /* 2nd char of ucfirst(U+DF) is 's' */ - CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's'); + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's', + turkic_uc); CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE); } @@ -3894,7 +4018,8 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; - CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */) + CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */, + turkic_lc); CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE) } @@ -3936,7 +4061,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, assert(p != ustrp); /* Otherwise overwrites */ CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1, - ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII))); + ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)), + turkic_fc); result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL); |