PATCH: [perl #129953] lib/locale.t failures on FREEBSD

I thought this bug was in FREEBSD, but when I went to gather the info needed to report it to the vendor, it turned out to be a mistake I had made. The problem is basically doubly encoding into UTF-8. In order to save CPU time, in a UTF-8 locale, I had stored a string as UTF-8 encoded. This string is to be inserted into a larger string. What I neglected to consider in this situation is that not all strings in such locales need be in UTF-8. The UTF-8 encoded insert could get added to a non-UTF-8 string, and the result later was switched to UTF-8, so the inserted string's bytes were individually converted to UTF-8, effectively a second time. This is a problem only if the inserted string is different when encoded in UTF-8 than not, and for this particular usage, on most platforms it was UTF-8 invariant, so did not show up, except on those platforms where it was variant. The solution is to store the replacement as a code point, and encode it as UTF-8 only if necessary, once. This actually simplifies the code.
author: Karl Williamson <khw@cpan.org> 2016-11-28 09:09:23 -0700
committer: Karl Williamson <khw@cpan.org> 2016-11-28 17:15:24 -0700
commit: afc4976faee3dbcd0f85100736d54a8694d26645 (patch)
tree: 03a53207363b338503970f81a52e6eb1cd3feecb
parent: 1e4c96768cc9fe7008eef89b69243de628c78837 (diff)
download: perl-afc4976faee3dbcd0f85100736d54a8694d26645.tar.gz
2 files changed, 34 insertions, 36 deletions
diff --git a/intrpvar.h b/intrpvar.h
index 4243fc87aa..db6251cecc 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -565,7 +565,8 @@ PERLVAR(I, collation_name, char *)	/* Name of current collation */
 PERLVAR(I, collxfrm_base, Size_t)	/* Basic overhead in *xfrm() */
 PERLVARI(I, collxfrm_mult,Size_t, 2)	/* Expansion factor in *xfrm() */
 PERLVARI(I, collation_ix, U32,	0)	/* Collation generation index */
-PERLVARA(I, strxfrm_min_char, 3, char)
+PERLVARI(I, strxfrm_min_char, U8, 0)    /* Code point that sorts earliest in
+                                           locale */
 PERLVARI(I, strxfrm_is_behaved, bool, TRUE)
                             /* Assume until proven otherwise that it works */
 PERLVARI(I, strxfrm_max_cp, U8, 0)      /* Highest collating cp in locale */
diff --git a/locale.c b/locale.c
index 8d8ed4ca0e..a93573e4c8 100644
--- a/locale.c
+++ b/locale.c
@@ -514,7 +514,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
 	PL_collxfrm_base = 0;
 	PL_collxfrm_mult = 2;
         PL_in_utf8_COLLATE_locale = FALSE;
-        *PL_strxfrm_min_char = '\0';
+        PL_strxfrm_min_char = '\0';
         PL_strxfrm_max_cp = 0;
 	return;
     }
@@ -530,7 +530,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
         }
 
         PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
-        *PL_strxfrm_min_char = '\0';
+        PL_strxfrm_min_char = '\0';
         PL_strxfrm_max_cp = 0;
 
         /* A locale collation definition includes primary, secondary, tertiary,
@@ -1468,18 +1468,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
     if (s_strlen < len) {   /* Only execute if there is an embedded NUL */
         char * e = s + len;
         char * sans_nuls;
-        STRLEN cur_min_char_len;
         STRLEN sans_nuls_len;
         STRLEN sans_nuls_pos;
         int try_non_controls;
+        char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
+                                                   making sure 2nd byte is NUL.
+                                                 */
+        STRLEN this_replacement_len;
+
         /* If we don't know what non-NUL control character sorts lowest for
          * this locale, find it */
-        if (*PL_strxfrm_min_char == '\0') {
+        if (PL_strxfrm_min_char == '\0') {
             int j;
-#ifdef DEBUGGING
-            U8     cur_min_cp = 1; /* The code point that sorts lowest, so far */
-#endif
-            char * cur_min_x = NULL;    /* And its xfrm, (except it also
+            char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
                                            includes the collation index
                                            prefixed. */
 
@@ -1503,29 +1504,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                     STRLEN x_len;   /* length of 'x' */
                     STRLEN trial_len = 1;
 
-                    /* Create a 1 byte string of the current code point, but
-                     * with room to be 2 bytes */
-                    char cur_source[] = { (char) j, '\0' , '\0' };
-
-                    if (PL_in_utf8_COLLATE_locale) {
-                        if (! try_non_controls && ! isCNTRL_L1(j)) {
-                            continue;
-                        }
+                    /* Create a 1 byte string of the current code point */
+                    char cur_source[] = { (char) j, '\0' };
 
-                        /* If needs to be 2 bytes, find them */
-                        if (! UVCHR_IS_INVARIANT(j)) {
-                            char * d = cur_source;
-                            append_utf8_from_native_byte((U8) j, (U8 **) &d);
-                            trial_len = 2;
-                        }
-                    }
-                    else if (! try_non_controls && ! isCNTRL_LC(j)) {
+                    if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
+                                               ? ! isCNTRL_L1(j)
+                                               : ! isCNTRL_LC(j))
+                    {
                         continue;
                     }
 
                     /* Then transform it */
                     x = _mem_collxfrm(cur_source, trial_len, &x_len,
-                                      PL_in_utf8_COLLATE_locale);
+                                      0 /* The string is not in UTF-8 */);
 
                     /* Ignore any character that didn't successfully transform.
                      * */
@@ -1539,13 +1530,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                         || strLT(x         + COLLXFRM_HDR_LEN,
                                  cur_min_x + COLLXFRM_HDR_LEN))
                     {
-                        PL_strxfrm_min_char[0] = cur_source[0];
-                        PL_strxfrm_min_char[1] = cur_source[1];
-                        PL_strxfrm_min_char[2] = cur_source[2];
+                        PL_strxfrm_min_char = j;
                         cur_min_x = x;
-#ifdef DEBUGGING
-                        cur_min_cp = j;
-#endif
                     }
                     else {
                         Safefree(x);
@@ -1573,17 +1559,28 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
 
             DEBUG_L(PerlIO_printf(Perl_debug_log,
                     "_mem_collxfrm: Replacing embedded NULs in locale %s with "
-                    "0x%02X\n", PL_collation_name, cur_min_cp));
+                    "0x%02X\n", PL_collation_name, PL_strxfrm_min_char));
 
             Safefree(cur_min_x);
         } /* End of determining the character that is to replace NULs */
+
+        /* If the replacement is variant under UTF-8, it must match the
+         * UTF8-ness as the original */
+        if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_min_char) && utf8) {
+            this_replacement_char[0] = UTF8_EIGHT_BIT_HI(PL_strxfrm_min_char);
+            this_replacement_char[1] = UTF8_EIGHT_BIT_LO(PL_strxfrm_min_char);
+            this_replacement_len = 2;
+        }
+        else {
+            this_replacement_char[0] = PL_strxfrm_min_char;
+            /* this_replacement_char[1] = '\0' was done at initialization */
+            this_replacement_len = 1;
         }
 
         /* The worst case length for the replaced string would be if every
          * character in it is NUL.  Multiply that by the length of each
          * replacement, and allow for a trailing NUL */
-        cur_min_char_len = strlen(PL_strxfrm_min_char);
-        sans_nuls_len = (len * cur_min_char_len) + 1;
+        sans_nuls_len = (len * this_replacement_len) + 1;
         Newx(sans_nuls, sans_nuls_len, char);
         *sans_nuls = '\0';
         sans_nuls_pos = 0;
@@ -1597,7 +1594,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
 
             /* Do the actual replacement */
             sans_nuls_pos = my_strlcat(sans_nuls + sans_nuls_pos,
-                                       PL_strxfrm_min_char,
+                                       this_replacement_char,
                                        sans_nuls_len);
 
             /* Move past the input NUL */
author	Karl Williamson <khw@cpan.org>	2016-11-28 09:09:23 -0700
committer	Karl Williamson <khw@cpan.org>	2016-11-28 17:15:24 -0700
commit	afc4976faee3dbcd0f85100736d54a8694d26645 (patch)
tree	03a53207363b338503970f81a52e6eb1cd3feecb
parent	1e4c96768cc9fe7008eef89b69243de628c78837 (diff)
download	perl-afc4976faee3dbcd0f85100736d54a8694d26645.tar.gz