summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-11-28 09:09:23 -0700
committerKarl Williamson <khw@cpan.org>2016-11-28 17:15:24 -0700
commitafc4976faee3dbcd0f85100736d54a8694d26645 (patch)
tree03a53207363b338503970f81a52e6eb1cd3feecb
parent1e4c96768cc9fe7008eef89b69243de628c78837 (diff)
downloadperl-afc4976faee3dbcd0f85100736d54a8694d26645.tar.gz
PATCH: [perl #129953] lib/locale.t failures on FREEBSD
I thought this bug was in FREEBSD, but when I went to gather the info needed to report it to the vendor, it turned out to be a mistake I had made. The problem is basically doubly encoding into UTF-8. In order to save CPU time, in a UTF-8 locale, I had stored a string as UTF-8 encoded. This string is to be inserted into a larger string. What I neglected to consider in this situation is that not all strings in such locales need be in UTF-8. The UTF-8 encoded insert could get added to a non-UTF-8 string, and the result later was switched to UTF-8, so the inserted string's bytes were individually converted to UTF-8, effectively a second time. This is a problem only if the inserted string is different when encoded in UTF-8 than not, and for this particular usage, on most platforms it was UTF-8 invariant, so did not show up, except on those platforms where it was variant. The solution is to store the replacement as a code point, and encode it as UTF-8 only if necessary, once. This actually simplifies the code.
-rw-r--r--intrpvar.h3
-rw-r--r--locale.c67
2 files changed, 34 insertions, 36 deletions
diff --git a/intrpvar.h b/intrpvar.h
index 4243fc87aa..db6251cecc 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -565,7 +565,8 @@ PERLVAR(I, collation_name, char *) /* Name of current collation */
PERLVAR(I, collxfrm_base, Size_t) /* Basic overhead in *xfrm() */
PERLVARI(I, collxfrm_mult,Size_t, 2) /* Expansion factor in *xfrm() */
PERLVARI(I, collation_ix, U32, 0) /* Collation generation index */
-PERLVARA(I, strxfrm_min_char, 3, char)
+PERLVARI(I, strxfrm_min_char, U8, 0) /* Code point that sorts earliest in
+ locale */
PERLVARI(I, strxfrm_is_behaved, bool, TRUE)
/* Assume until proven otherwise that it works */
PERLVARI(I, strxfrm_max_cp, U8, 0) /* Highest collating cp in locale */
diff --git a/locale.c b/locale.c
index 8d8ed4ca0e..a93573e4c8 100644
--- a/locale.c
+++ b/locale.c
@@ -514,7 +514,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
PL_collxfrm_base = 0;
PL_collxfrm_mult = 2;
PL_in_utf8_COLLATE_locale = FALSE;
- *PL_strxfrm_min_char = '\0';
+ PL_strxfrm_min_char = '\0';
PL_strxfrm_max_cp = 0;
return;
}
@@ -530,7 +530,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
}
PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
- *PL_strxfrm_min_char = '\0';
+ PL_strxfrm_min_char = '\0';
PL_strxfrm_max_cp = 0;
/* A locale collation definition includes primary, secondary, tertiary,
@@ -1468,18 +1468,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
if (s_strlen < len) { /* Only execute if there is an embedded NUL */
char * e = s + len;
char * sans_nuls;
- STRLEN cur_min_char_len;
STRLEN sans_nuls_len;
STRLEN sans_nuls_pos;
int try_non_controls;
+ char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
+ making sure 2nd byte is NUL.
+ */
+ STRLEN this_replacement_len;
+
/* If we don't know what non-NUL control character sorts lowest for
* this locale, find it */
- if (*PL_strxfrm_min_char == '\0') {
+ if (PL_strxfrm_min_char == '\0') {
int j;
-#ifdef DEBUGGING
- U8 cur_min_cp = 1; /* The code point that sorts lowest, so far */
-#endif
- char * cur_min_x = NULL; /* And its xfrm, (except it also
+ char * cur_min_x = NULL; /* The min_char's xfrm, (except it also
includes the collation index
prefixed. */
@@ -1503,29 +1504,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
STRLEN x_len; /* length of 'x' */
STRLEN trial_len = 1;
- /* Create a 1 byte string of the current code point, but
- * with room to be 2 bytes */
- char cur_source[] = { (char) j, '\0' , '\0' };
-
- if (PL_in_utf8_COLLATE_locale) {
- if (! try_non_controls && ! isCNTRL_L1(j)) {
- continue;
- }
+ /* Create a 1 byte string of the current code point */
+ char cur_source[] = { (char) j, '\0' };
- /* If needs to be 2 bytes, find them */
- if (! UVCHR_IS_INVARIANT(j)) {
- char * d = cur_source;
- append_utf8_from_native_byte((U8) j, (U8 **) &d);
- trial_len = 2;
- }
- }
- else if (! try_non_controls && ! isCNTRL_LC(j)) {
+ if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
+ ? ! isCNTRL_L1(j)
+ : ! isCNTRL_LC(j))
+ {
continue;
}
/* Then transform it */
x = _mem_collxfrm(cur_source, trial_len, &x_len,
- PL_in_utf8_COLLATE_locale);
+ 0 /* The string is not in UTF-8 */);
/* Ignore any character that didn't successfully transform.
* */
@@ -1539,13 +1530,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
|| strLT(x + COLLXFRM_HDR_LEN,
cur_min_x + COLLXFRM_HDR_LEN))
{
- PL_strxfrm_min_char[0] = cur_source[0];
- PL_strxfrm_min_char[1] = cur_source[1];
- PL_strxfrm_min_char[2] = cur_source[2];
+ PL_strxfrm_min_char = j;
cur_min_x = x;
-#ifdef DEBUGGING
- cur_min_cp = j;
-#endif
}
else {
Safefree(x);
@@ -1573,17 +1559,28 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
DEBUG_L(PerlIO_printf(Perl_debug_log,
"_mem_collxfrm: Replacing embedded NULs in locale %s with "
- "0x%02X\n", PL_collation_name, cur_min_cp));
+ "0x%02X\n", PL_collation_name, PL_strxfrm_min_char));
Safefree(cur_min_x);
} /* End of determining the character that is to replace NULs */
+
+ /* If the replacement is variant under UTF-8, it must match the
+ * UTF8-ness as the original */
+ if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_min_char) && utf8) {
+ this_replacement_char[0] = UTF8_EIGHT_BIT_HI(PL_strxfrm_min_char);
+ this_replacement_char[1] = UTF8_EIGHT_BIT_LO(PL_strxfrm_min_char);
+ this_replacement_len = 2;
+ }
+ else {
+ this_replacement_char[0] = PL_strxfrm_min_char;
+ /* this_replacement_char[1] = '\0' was done at initialization */
+ this_replacement_len = 1;
}
/* The worst case length for the replaced string would be if every
* character in it is NUL. Multiply that by the length of each
* replacement, and allow for a trailing NUL */
- cur_min_char_len = strlen(PL_strxfrm_min_char);
- sans_nuls_len = (len * cur_min_char_len) + 1;
+ sans_nuls_len = (len * this_replacement_len) + 1;
Newx(sans_nuls, sans_nuls_len, char);
*sans_nuls = '\0';
sans_nuls_pos = 0;
@@ -1597,7 +1594,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
/* Do the actual replacement */
sans_nuls_pos = my_strlcat(sans_nuls + sans_nuls_pos,
- PL_strxfrm_min_char,
+ this_replacement_char,
sans_nuls_len);
/* Move past the input NUL */