locale.c: Add detection of Turkic UTF-8 locales

When switching into a new locale, after it has been determined that this is a UTF-8 locale, the code now also checks whether the locale is a specialized Turkic one, which has a couple of slightly modified case-changing rules. If so, it sets a flag indicating this. The code added in previous commits in this series checks whether that flag is set when it is actually paying attention to the background locale, and if so behaves according to Unicode Turkic rules.
author: Karl Williamson <khw@cpan.org> 2019-02-04 18:58:26 -0700
committer: Karl Williamson <khw@cpan.org> 2019-02-05 11:44:29 -0700
commit: 30d8090de81085bd3dff00c83a7ab6d3ff8dfc8d (patch)
tree: f3512f86d360f5c2fbaf05b88627dc2ecfafc20f /locale.c
parent: 26be5fe6b967cd228768870a4b6138947d418d39 (diff)
download: perl-30d8090de81085bd3dff00c83a7ab6d3ff8dfc8d.tar.gz
1 files changed, 24 insertions, 1 deletions
diff --git a/locale.c b/locale.c
index 383b2137c0..07e5525c10 100644
--- a/locale.c
+++ b/locale.c
@@ -1507,6 +1507,7 @@ S_new_ctype(pTHX_ const char *newctype)
 
     /* Don't check for problems if we are suppressing the warnings */
     bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
+    bool maybe_utf8_turkic = FALSE;
 
     PERL_ARGS_ASSERT_NEW_CTYPE;
 
@@ -1523,6 +1524,14 @@ S_new_ctype(pTHX_ const char *newctype)
      * handle this specially because of the three problematic code points */
     if (PL_in_utf8_CTYPE_locale) {
         Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
+
+        /* UTF-8 locales can have special handling for 'I' and 'i' if they are
+         * Turkic.  Make sure these two are the only anomalies.  (We don't use
+         * towupper and towlower because they aren't in C89.) */
+        if (toupper('i') == 'i' && tolower('I') == 'I') {
+            check_for_problems = TRUE;
+            maybe_utf8_turkic = TRUE;
+        }
     }
 
     /* We don't populate the other lists if a UTF-8 locale, but do check that
@@ -1668,7 +1677,18 @@ S_new_ctype(pTHX_ const char *newctype)
             }
         }
 
+        if (bad_count == 2 && maybe_utf8_turkic) {
+            bad_count = 0;
+            *bad_chars_list = '\0';
+            PL_fold_locale['I'] = 'I';
+            PL_fold_locale['i'] = 'i';
+            PL_in_utf8_turkic_locale = TRUE;
+            DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n",
+                                                 __FILE__, __LINE__, newctype));
+        }
+        else {
             PL_in_utf8_turkic_locale = FALSE;
+        }
 
 #  ifdef MB_CUR_MAX
 
@@ -1695,7 +1715,10 @@ S_new_ctype(pTHX_ const char *newctype)
 
 #  endif
 
-        if (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale)) {
+        /* If we found problems and we want them output, do so */
+        if (   (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
+            && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
+        {
             if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
                 PL_warn_locale = Perl_newSVpvf(aTHX_
                      "Locale '%s' contains (at least) the following characters"
author	Karl Williamson <khw@cpan.org>	2019-02-04 18:58:26 -0700
committer	Karl Williamson <khw@cpan.org>	2019-02-05 11:44:29 -0700
commit	30d8090de81085bd3dff00c83a7ab6d3ff8dfc8d (patch)
tree	f3512f86d360f5c2fbaf05b88627dc2ecfafc20f /locale.c
parent	26be5fe6b967cd228768870a4b6138947d418d39 (diff)
download	perl-30d8090de81085bd3dff00c83a7ab6d3ff8dfc8d.tar.gz