summaryrefslogtreecommitdiff
path: root/locale.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-11-28 05:37:04 -0700
committerKarl Williamson <khw@cpan.org>2022-12-07 09:13:37 -0700
commit6af0187d84aff4f4fd1b8c5039418e86d16cfafd (patch)
treea2dea82a5d6c4ac518def2cd29a0d4b744ac7742 /locale.c
parent2fc8f382932e3d44b386b00160160e845822903f (diff)
downloadperl-6af0187d84aff4f4fd1b8c5039418e86d16cfafd.tar.gz
locale.c: Rewrite localeconv() handling
localeconv() returns a structure contaiing fields that are associated with two different categories: LC_NUMERIC and LC_MONETARY. Perl via POSIX::localeconv() reutrns a hash containing all the fields. Testing on Windows showed that if LC_CTYPE is not the same locale as LC_MONETARY for the monetary fields, or isn't the same as LC_NUMERIC for the numeric ones, mojibake can result. The solution to similar situations elsewhere in the code is to toggle LC_CTYPE into being the same locale as the one for the returned fields. But those situations only have a single locale that LC_CTYPE has to match, so it doesn't work here when LC_NUMERIC and LC_MONETARY are different locales. Unlike Schrödinger's cat, LC_CTYPE has to be one or the other, not both at the same time. The previous implementation did not consider this possibility, and wasn't easily changeable to work. Therefore, this rewrites a bunch of it. The solution used is to call localeconv() twice when the LC_NUMERIC locale and the LC_MONETARY locale don't match (with LC_CTYPE toggled to the corresponding one each time). (Only one call is made if the two categories have the same locale.) This one vs two complicated the code, but I thought it was worth it given that the one call is the most likely case. Another complication is that on platforms that lack nl_langinfo(), (Windows, for example), localeconv() is used to emulate portions of it. Previously there was a separate function to handle this, using an SV() cast as an HV() to avoid using a hash that wasn't actually necessary. That proved to lead to extra duplicated code under the new scheme, so that function was collapsed into a single one and a real hash is used in all circumstances, but is only populated with the one or two fields needed for the emulation. The only part of this commit that I thought could be split off from the rest concerns the fact that localeconv()'s return is not thread-safe, and so must be copied to a safe place (the hash) while in a critical section, locking out all other threads. Before this commit, that copying was accompanied by determining if each string field needed to be marked as UTF-8. That determination isn't necessarily trivial, so should really not be in the critical section. This commit does that. And, with some effort, that part could have been split into a separate commit. but I didn't think it was worth the effort.
Diffstat (limited to 'locale.c')
-rw-r--r--locale.c884
1 files changed, 539 insertions, 345 deletions
diff --git a/locale.c b/locale.c
index b3b4d74b6a..b9d2b7edb4 100644
--- a/locale.c
+++ b/locale.c
@@ -3193,8 +3193,7 @@ HV *
Perl_localeconv(pTHX)
{
-#if ! defined(HAS_LOCALECONV) \
- || (! defined(USE_LOCALE_MONETARY) && ! defined(USE_LOCALE_NUMERIC))
+#if ! defined(HAS_LOCALECONV)
return newHV();
@@ -3206,141 +3205,453 @@ Perl_localeconv(pTHX)
}
-#if defined(HAS_LOCALECONV) \
- && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
+#if defined(HAS_LOCALECONV)
HV *
S_my_localeconv(pTHX_ const int item)
{
- HV * retval;
- locale_utf8ness_t numeric_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
- locale_utf8ness_t monetary_locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
- HV * (*copy_localeconv)(pTHX_ const struct lconv *,
- const int,
- const locale_utf8ness_t,
- const locale_utf8ness_t);
-
- /* A thread-safe locale_conv(). The locking mechanisms vary greatly
- * depending on platform capabilities. They all share this common set up
- * code for the function, and then conditional compilations choose one of
- * several terminations.
+ PERL_ARGS_ASSERT_MY_LOCALECONV;
+
+ /* This returns a mortalized hash containing all or one of the elements
+ * returned by localeconv(). It is used by Perl_localeconv() and
+ * POSIX::localeconv() and is thread-safe.
*
* There are two use cases:
- * 1) Called from POSIX::locale_conv(). This returns lconv() copied to
- * a hash, based on the current underlying locale.
+ * 1) Called from POSIX::locale_conv(). This returns the lconv structure
+ * copied to a hash, based on the current underlying locales for
+ * LC_NUMERIC and LC_MONETARY. An input item==0 signifies this case, or
+ * on many platforms it is the only use case compiled.
* 2) Certain items that nl_langinfo() provides are also derivable from
* the return of localeconv(). Windows notably doesn't have
* nl_langinfo(), so on that, and actually any platform lacking it,
- * my_localeconv() is used to emulate it for those particular items.
- * The code to do this is compiled only on such platforms. Rather than
- * going to the expense of creating a full hash when only one item is
- * needed, just the desired item is returned, in an SV cast to an HV.
+ * my_localeconv() is used also to emulate it for those particular
+ * items. The code to do this is compiled only on such platforms.
+ * Rather than going to the expense of creating a full hash when only
+ * one item is needed, the returned hash has just the desired item in
+ * it.
+ *
+ * To access all the localeconv() struct lconv fields, there is a data
+ * structure that contains every commonly documented field in it. (Maybe
+ * some minority platforms have extra fields. Those could be added here
+ * without harm; they would just be ignored on platforms lacking them.)
*
- * There is a helper function to accomplish each of the two tasks. The
- * function pointer just below is set to the appropriate one, and is called
- * from each of the various implementations, in the middle of whatever
- * necessary locking/locale swapping have been done. */
+ * Our structure is compiled to make looping through the fields easier by
+ * pointing each name to its value's offset within lconv, e.g.,
+ { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
+ */
+# define LCONV_ENTRY(name) \
+ {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
+
+ /* These synonyms are just for clarity, and to make it easier in case
+ * something needs to change in the future */
+# define LCONV_NUMERIC_ENTRY(name) LCONV_ENTRY(name)
+# define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
+
+ /* There are just a few fields for NUMERIC strings */
+ const lconv_offset_t lconv_numeric_strings[] = {
+# ifndef NO_LOCALECONV_GROUPING
+ LCONV_NUMERIC_ENTRY(grouping),
+# endif
+ LCONV_NUMERIC_ENTRY(thousands_sep),
+ LCONV_NUMERIC_ENTRY(decimal_point),
+ {NULL, 0}
+ };
-# ifdef HAS_SOME_LANGINFO
+ /* When used to implement nl_langinfo(), we save time by only populating
+ * the hash with the field(s) needed. Thus we would need a data structure
+ * of just:
+ * LCONV_NUMERIC_ENTRY(decimal_point),
+ * {NULL, 0}
+ *
+ * By placing the decimal_point field last in the full structure, we can
+ * use just the tail for this bit of it, saving space. This macro yields
+ * the address of the sub structure. */
+# define DECIMAL_POINT_ADDRESS \
+ &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
+
+ /* And the MONETARY string fields */
+ const lconv_offset_t lconv_monetary_strings[] = {
+ LCONV_MONETARY_ENTRY(int_curr_symbol),
+ LCONV_MONETARY_ENTRY(mon_decimal_point),
+# ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
+ LCONV_MONETARY_ENTRY(mon_thousands_sep),
+# endif
+# ifndef NO_LOCALECONV_MON_GROUPING
+ LCONV_MONETARY_ENTRY(mon_grouping),
+# endif
+ LCONV_MONETARY_ENTRY(positive_sign),
+ LCONV_MONETARY_ENTRY(negative_sign),
+ LCONV_MONETARY_ENTRY(currency_symbol),
+ {NULL, 0}
+ };
- PERL_UNUSED_ARG(item);
+ /* Like above, this field being last can be used as a sub structure */
+# define CURRENCY_SYMBOL_ADDRESS \
+ &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
- const bool is_localeconv_call = true;
+ /* Finally there are integer fields, all are for monetary purposes */
+ const lconv_offset_t lconv_integers[] = {
+ LCONV_ENTRY(int_frac_digits),
+ LCONV_ENTRY(frac_digits),
+ LCONV_ENTRY(p_sep_by_space),
+ LCONV_ENTRY(n_cs_precedes),
+ LCONV_ENTRY(n_sep_by_space),
+ LCONV_ENTRY(p_sign_posn),
+ LCONV_ENTRY(n_sign_posn),
+# ifdef HAS_LC_MONETARY_2008
+ LCONV_ENTRY(int_p_cs_precedes),
+ LCONV_ENTRY(int_p_sep_by_space),
+ LCONV_ENTRY(int_n_cs_precedes),
+ LCONV_ENTRY(int_n_sep_by_space),
+ LCONV_ENTRY(int_p_sign_posn),
+ LCONV_ENTRY(int_n_sign_posn),
+# endif
+ LCONV_ENTRY(p_cs_precedes),
+ {NULL, 0}
+ };
-# else
+ /* Like above, this field being last can be used as a sub structure */
+# define P_CS_PRECEDES_ADDRESS \
+ &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
- /* Note we use this sentinel; this works because this only gets compiled
- * when our perl_langinfo.h is used, and that uses negative numbers for all
- * the items */
- const bool is_localeconv_call = (item == 0);
+ /* If we aren't paying attention to a given category, use LC_CTYPE instead;
+ * If not paying attention to that either, the code below should end up not
+ * using this. Make sure that things blow up if that avoidance gets lost,
+ * by setting the category to -1 */
+ unsigned int numeric_index;
+ unsigned int monetary_index;
+# ifdef USE_LOCALE_NUMERIC
+ numeric_index = LC_NUMERIC_INDEX_;
+# elif defined(USE_LOCALE_CTYPE)
+ numeric_index = LC_CTYPE_INDEX_;
+# else
+ numeric_index = (unsigned) -1;
# endif
+# ifdef USE_LOCALE_MONETARY
+ monetary_index = LC_MONETARY_INDEX_;
+# elif defined(USE_LOCALE_CTYPE)
+ monetary_index = LC_CTYPE_INDEX_;
+# else
+ monetary_index = (unsigned) -1;
+# endif
+
+ /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
+ * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
+ * for the monetary ones. What happens if LC_NUMERIC and LC_MONETARY
+ * aren't compatible? Wrong results. To avoid that, we call localeconv()
+ * twice, once for each locale, setting LC_CTYPE to match the category.
+ * But if the locales of both categories are the same, there is no need for
+ * a second call. Assume this is the case unless overridden below */
+ bool requires_2nd_localeconv = false;
+
+ /* The actual hash populating is done by S_populate_hash_from_localeconv().
+ * It gets passed an array of length two containing the data structure it
+ * is supposed to use to get the key names to fill the hash with. One
+ * element is alwasy for the NUMERIC strings (or NULL if none to use), and
+ * the other element similarly for the MONETARY ones. */
+# define NUMERIC_STRING_OFFSET 0
+# define MONETARY_STRING_OFFSET 1
+ const lconv_offset_t * strings[2] = { NULL, NULL };
+
+ /* This is a mask, with one bit to tell S_populate_hash_from_localeconv to
+ * populate the NUMERIC items; another bit for the MONETARY ones. This way
+ * it can choose which (or both) to populate from */
+ U32 index_bits = 0;
+
+ /* This converts from a locale index to its bit position in the above mask.
+ * */
+# define INDEX_TO_BIT(i) (1 << (i))
+
+ /* The two categories can have disparate locales. Initialize them to C and
+ * override later whichever one(s) we pay attention to */
+ const char * numeric_locale = "C";
+ const char * monetary_locale = "C";
+
+ /* This will be either 'numeric_locale' or 'monetary_locale' depending on
+ * what we are working on at the moment */
+ const char * locale;
- if (is_localeconv_call) {
- copy_localeconv = S_populate_localeconv;
+ /* The LC_MONETARY category also has some integer-valued fields, whose
+ * information is kept in a separate list */
+ const lconv_offset_t * integers;
+
+# ifdef HAS_SOME_LANGINFO
+
+ /* If the only use-case for this is the full localeconv(), the 'item'
+ * parameter is ignored. */
+ PERL_UNUSED_ARG(item);
+
+# else
+
+ /* This only gets compiled for the use-case of using localeconv() to
+ * emulate an nl_langinfo() missing from the platform.
+ *
+ * We need this substructure to only return this field for the THOUSEP
+ * item. The other items also need substructures, but they were handled
+ * above by placing the substructure's item at the end of the full one, so
+ * the data structure could do double duty. However, both this and
+ * RADIXCHAR would need to be in the final position of the same full
+ * structure; an impossibility. So make this into a separate structure */
+ const lconv_offset_t thousands_sep_string[] = {
+ LCONV_NUMERIC_ENTRY(thousands_sep),
+ {NULL, 0}
+ };
+
+ /* End of all the initialization of datastructures. Now for actual code.
+ *
+ * Without nl_langinfo(), the call to my_localeconv() could be for just one
+ * of the following 3 items to emulate nl_langinfo(). This is compiled
+ * only when using perl_langinfo.h, which we control, and it has been
+ * constructed so that no item is numbered 0.
+ *
+ * For each, setup the appropriate parameters for the call below to
+ * S_populate_hash_from_localeconv() */
+ if (item != 0) switch (item) {
+ default:
+ locale_panic_(Perl_form(aTHX_
+ "Unexpected item passed to my_localeconv: %d", item));
+ break;
# ifdef USE_LOCALE_NUMERIC
- /* Get the UTF8ness of the locales now to avoid repeating this for each
- * string returned by localeconv() */
- numeric_locale_is_utf8 = (is_locale_utf8(PL_numeric_name))
- ? LOCALE_IS_UTF8
- : LOCALE_NOT_UTF8;
+ case RADIXCHAR:
+ locale = numeric_locale = PL_numeric_name;
+ index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
+ strings[NUMERIC_STRING_OFFSET] = DECIMAL_POINT_ADDRESS;
+ integers = NULL;
+ break;
+
+ case THOUSEP:
+ index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
+ locale = numeric_locale = PL_numeric_name;
+ strings[NUMERIC_STRING_OFFSET] = thousands_sep_string;
+ integers = NULL;
+ break;
# endif
# ifdef USE_LOCALE_MONETARY
- monetary_locale_is_utf8 = (is_locale_utf8(querylocale_c(LC_MONETARY)))
- ? LOCALE_IS_UTF8
- : LOCALE_NOT_UTF8;
+ case CRNCYSTR:
+ index_bits = INDEX_TO_BIT(LC_MONETARY_INDEX_);
+ locale = monetary_locale = querylocale_i(LC_MONETARY_INDEX_);
+
+ /* This item needs the values for both the currency symbol, and another
+ * one used to construct the nl_langino()-compatible return */
+ strings[MONETARY_STRING_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
+ integers = P_CS_PRECEDES_ADDRESS;
+ break;
+
+# endif
+
+ } /* End of switch() */
+
+ else /* End of for just one item to emulate nl_langinfo() */
# endif
- }
+ { /* Here, the call is for all of localeconv(). It has a bunch of
+ * items. As in the individual item case, set up the parameters for
+ * S_populate_hash_from_localeconv(); */
-# ifndef HAS_SOME_LANGINFO
+# ifdef USE_LOCALE_NUMERIC
+ numeric_locale = PL_numeric_name;
+# elif defined(USE_LOCALE_CTYPE)
+ numeric_locale = querylocale_i(numeric_index);
+# endif
+# if defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_CTYPE)
+ monetary_locale = querylocale_i(monetary_index);
+# endif
- else {
- copy_localeconv = S_get_nl_item_from_localeconv;
- }
+ /* The first call to S_populate_hash_from_localeconv() will be for the
+ * MONETARY values */
+ index_bits = INDEX_TO_BIT(monetary_index);
+ locale = monetary_locale;
+
+ /* And if the locales for the two categories are the same, we can also
+ * do the NUMERIC values in the same call */
+ if (strEQ(numeric_locale, monetary_locale)) {
+ index_bits |= INDEX_TO_BIT(numeric_index);
+ }
+ else {
+ requires_2nd_localeconv = true;
+ }
+
+ /* We always pass both sets of strings. 'index_bits' tells
+ * S_populate_hash_from_localeconv which to actually look at */
+ strings[NUMERIC_STRING_OFFSET] = lconv_numeric_strings;
+ strings[MONETARY_STRING_OFFSET] = lconv_monetary_strings;
+
+ /* And pass the integer values to populate; again 'index_bits' will
+ * say to use them or not */
+ integers = lconv_integers;
+
+ } /* End of call is for localeconv() */
+
+ /* The code above has determined the parameters to
+ S_populate_hash_from_localeconv() for both cases of an individual item
+ and for the entire structure. Below is code common to both */
+
+ HV * hv = newHV(); /* The returned hash, initially empty */
+ sv_2mortal((SV*)hv);
+
+ /* Call localeconv() and copy its results into the hash. All the
+ * parameters have been initialized above */
+ populate_hash_from_localeconv(hv,
+ locale,
+ index_bits,
+ strings,
+ integers
+ );
+
+ /* The above call may have done all the hash fields, but not always, as
+ * already explained. If we need a second call it is always for the
+ * NUMERIC fields */
+ if (requires_2nd_localeconv) {
+ populate_hash_from_localeconv(hv,
+ numeric_locale,
+ INDEX_TO_BIT(numeric_index),
+ strings,
+ NULL /* There are No NUMERIC integer
+ fields */
+ );
+ }
+
+ /* Here, the hash has been completely populated.
+ *
+ * Now go through all the string items and see if they should be marked as
+ * UTF-8 or not. This would have been more convenient and faster to do
+ * while populating the hash in the first place, but that operation has to
+ * be done within a critical section, keeping other threads from executing,
+ * so only the minimal amount of work necessary is done at that time.
+ *
+ * XXX On unthreaded perls, and on platforms where localeconv (or
+ * localeconv_l if present) this code could be #ifdef'd out, and the
+ * UTF8ness determined at hash population time, at an extra maintenance
+ * cost which khw doesn't think is worth it
+ */
+ for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */
+ if (! strings[i]) { /* Skip if no strings of this type */
+ continue;
+ }
+
+ locale = (i == NUMERIC_STRING_OFFSET)
+ ? numeric_locale
+ : monetary_locale;
+
+ locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
+
+# ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
+
+ /* It saves time in the loop below to have predetermined the UTF8ness
+ * of the locale. But only do so if the platform reliably has this
+ * information; otherwise to do it, this could recurse indefinitely.
+ *
+ * When we don't do it here, it will be done on a per-element basis in
+ * the loop. The per-element check is intelligent enough to not
+ * recurse */
+
+ locale_is_utf8 = (is_locale_utf8(locale))
+ ? LOCALE_IS_UTF8
+ : LOCALE_NOT_UTF8;
+
+ if (locale_is_utf8 == LOCALE_NOT_UTF8) {
+ continue; /* No string can be UTF-8 if the locale isn't */
+ }
# endif
- PERL_ARGS_ASSERT_MY_LOCALECONV;
-/*--------------------------------------------------------------------------*/
-/* Here, we are done with the common beginning of all the implementations of
- * my_localeconv(). Below are the two terminations of the function (except
- * the closing '}'. They are separated out because the preprocessor directives
- * were making the simple logic hard to follow. Each implementation ends with
- * the same few lines. khw decided to keep those separate because he thought
- * it was clearer to the reader.
- *--------------------------------------------------------------------------*/
-# if ! defined(TS_W32_BROKEN_LOCALECONV) /* Regular lconv() */
-
- /* There are so many locks because localeconv() deals with two
- * categories, and returns in a single global static buffer. Some
- * locks might be no-ops on this platform, but not others. We need to
- * lock if any one isn't a no-op. */
+ /* Examine each string */
+ while (1) {
+ const char * name = strings[i]->name;
-# ifdef USE_LOCALE_NUMERIC
+ if (! name) { /* Reached the end */
+ break;
+ }
- LC_NUMERIC_LOCK(0);
- const char * orig_switched_locale = NULL;
+ /* 'value' will contain the string that may need to be marked as
+ * UTF-8 */
+ SV ** value = hv_fetch(hv, name, strlen(name), true);
+ if (! value) {
+ continue;
+ }
- /* When called internally, are already switched into the proper numeric
- * locale; otherwise must toggle to it */
- if (is_localeconv_call) {
- orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
- }
+ /* Determine if the string should be marked as UTF-8. */
+ if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
+ locale_is_utf8,
+ NULL, 0)))
+ {
+ SvUTF8_on(*value);
+ }
-# endif
+ strings[i]++; /* Iterate */
+ }
+ } /* End of fixing up UTF8ness */
- gwLOCALE_LOCK;
- retval = copy_localeconv(aTHX_ localeconv(),
- item,
- numeric_locale_is_utf8,
- monetary_locale_is_utf8);
- gwLOCALE_UNLOCK;
+ return hv;
+}
-# ifdef USE_LOCALE_NUMERIC
+STATIC void
+S_populate_hash_from_localeconv(pTHX_ HV * hv,
+
+ /* Switch to this locale to run
+ * localeconv() from */
+ const char * locale,
+
+ /* bit mask of which categories to
+ * populate */
+ const U32 which_mask,
+
+ /* strings[0] points the the numeric
+ * string fields; [1] to the monetary */
+ const lconv_offset_t * strings[2],
+
+ /* And to the monetary integer fields */
+ const lconv_offset_t * integers)
+{
+ PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
+ PERL_UNUSED_ARG(which_mask); /* Some configurations don't use this;
+ complicated to figure out which */
+
+ /* Run localeconv() and copy some or all of its results to the input 'hv'
+ * hash. Most localeconv() implementations return the values in a global
+ * static buffer, so the operation must be performed in a critical section,
+ * ending only after the copy is completed. There are so many locks
+ * because localeconv() deals with two categories, and returns in a single
+ * global static buffer. Some locks might be no-ops on this platform, but
+ * not others. We need to lock if any one isn't a no-op. */
+
+# ifdef USE_LOCALE_CTYPE
+
+ /* Some platforms require LC_CTYPE to be congruent with the category we are
+ * looking for */
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
+
+# endif
+# ifdef USE_LOCALE_NUMERIC
- if (orig_switched_locale) {
- restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
+ /* We need to toggle to the underlying NUMERIC locale if we are getting
+ * NUMERIC strings */
+ const char * orig_NUMERIC_locale = NULL;
+ if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
+ LC_NUMERIC_LOCK(0);
+ orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, locale);
}
- LC_NUMERIC_UNLOCK;
# endif
- return retval;
+ /* Finally ready to do the actual localeconv(). Lock to prevent other
+ * accesses until we have made a copy of its returned static buffer */
+ gwLOCALE_LOCK;
-/*--------------------------------------------------------------------------*/
-# else /* defined(TS_W32_BROKEN_LOCALECONV) */
+# ifdef TS_W32_BROKEN_LOCALECONV
- /* Last is a workaround for the broken localeconv() on Windows with
- * thread-safe locales prior to VS 15. It looks at the global locale
- * instead of the thread one. As a work-around, we toggle to the global
- * locale; populate the return; then toggle back. We have to use LC_ALL
- * instead of the individual categories because of another bug in Windows.
+ /* This is a workaround for another bug in Windows. localeconv() was
+ * broken with thread-safe locales prior to VS 15. It looks at the global
+ * locale instead of the thread one. As a work-around, we toggle to the
+ * global locale; populate the return; then toggle back. We have to use
+ * LC_ALL instead of the individual categories because of yet another bug
+ * in Windows. And this all has to be done in a critical section.
*
* This introduces a potential race with any other thread that has also
* converted to use the global locale, and doesn't protect its locale calls
@@ -3349,25 +3660,11 @@ S_my_localeconv(pTHX_ const int item)
* if the code is ported from working on another platform where there might
* be some reason to do this. But this is typically due to some
* alien-to-Perl library that thinks it owns locale setting. Such a
- * library usn't likely to exist on Windows, so such an application is
+ * library isn't likely to exist on Windows, so such an application is
* unlikely to be run on Windows
*/
bool restore_per_thread = FALSE;
-# ifdef USE_LOCALE_NUMERIC
-
- const char * orig_switched_locale = NULL;
-
- LC_NUMERIC_LOCK(0);
-
- /* When called internally, are already switched into the proper numeric
- * locale; otherwise must toggle to it */
- if (is_localeconv_call) {
- orig_switched_locale = toggle_locale_c(LC_NUMERIC, PL_numeric_name);
- }
-
-# endif
-
/* Save the per-thread locale state */
const char * save_thread = querylocale_c(LC_ALL);
@@ -3383,270 +3680,113 @@ S_my_localeconv(pTHX_ const int item)
const char * save_global = querylocale_c(LC_ALL);
void_setlocale_c(LC_ALL, save_thread);
- /* Safely stash the desired data */
- gwLOCALE_LOCK;
- retval = copy_localeconv(aTHX_ localeconv(),
- item,
- numeric_locale_is_utf8,
- monetary_locale_is_utf8);
- gwLOCALE_UNLOCK;
+# endif /* TS_W32_BROKEN_LOCALECONV */
- /* Restore the global locale's prior state */
- void_setlocale_c(LC_ALL, save_global);
+ /* Finally, do the actual localeconv */
+ const char *lcbuf_as_string = (const char *) localeconv();
- /* And back to per-thread locales */
- if (restore_per_thread) {
- _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
- }
-
- /* Restore the per-thread locale state */
- void_setlocale_c(LC_ALL, save_thread);
-
-# ifdef USE_LOCALE_NUMERIC
-
- if (orig_switched_locale) {
- restore_toggled_locale_c(LC_NUMERIC, orig_switched_locale);
- }
- LC_NUMERIC_UNLOCK;
-
-# endif
-
- return retval;
-
-# endif
-/*--------------------------------------------------------------------------*/
-}
+ /* Fill in the string fields of the HV* */
+ for (unsigned int i = 0; i < 2; i++) {
-STATIC HV *
-S_populate_localeconv(pTHX_ const struct lconv *lcbuf,
- const int unused,
- const locale_utf8ness_t numeric_locale_is_utf8,
- const locale_utf8ness_t monetary_locale_is_utf8)
-{
- /* This returns a mortalized hash containing all the elements returned by
- * localeconv(). It is used by Perl_localeconv() and POSIX::localeconv()
- */
- PERL_UNUSED_ARG(unused);
-
- struct lconv_offset {
- const char *name;
- size_t offset;
- };
-
- /* Create e.g.,
- {"thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep)},
- */
-# define LCONV_ENTRY(name) \
- {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
-
- /* Set up structures containing the documented fields. One structure for
- * LC_NUMERIC-controlled strings; one for LC_MONETARY ones, and a final one
- * of just numerics. */
# ifdef USE_LOCALE_NUMERIC
- static const struct lconv_offset lconv_numeric_strings[] = {
- LCONV_ENTRY(decimal_point),
- LCONV_ENTRY(thousands_sep),
-# ifndef NO_LOCALECONV_GROUPING
- LCONV_ENTRY(grouping),
-# endif
- {NULL, 0}
- };
+ /* One iteration is only for the numeric string fields */
+ if ( i == NUMERIC_STRING_OFFSET
+ && (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) == 0)
+ {
+ continue;
+ }
# endif
# ifdef USE_LOCALE_MONETARY
- static const struct lconv_offset lconv_monetary_strings[] = {
- LCONV_ENTRY(int_curr_symbol),
- LCONV_ENTRY(currency_symbol),
- LCONV_ENTRY(mon_decimal_point),
-# ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
- LCONV_ENTRY(mon_thousands_sep),
-# endif
-# ifndef NO_LOCALECONV_MON_GROUPING
- LCONV_ENTRY(mon_grouping),
-# endif
- LCONV_ENTRY(positive_sign),
- LCONV_ENTRY(negative_sign),
- {NULL, 0}
- };
-
-# endif
-
- static const struct lconv_offset lconv_integers[] = {
-# ifdef USE_LOCALE_MONETARY
- LCONV_ENTRY(int_frac_digits),
- LCONV_ENTRY(frac_digits),
- LCONV_ENTRY(p_cs_precedes),
- LCONV_ENTRY(p_sep_by_space),
- LCONV_ENTRY(n_cs_precedes),
- LCONV_ENTRY(n_sep_by_space),
- LCONV_ENTRY(p_sign_posn),
- LCONV_ENTRY(n_sign_posn),
-# ifdef HAS_LC_MONETARY_2008
- LCONV_ENTRY(int_p_cs_precedes),
- LCONV_ENTRY(int_p_sep_by_space),
- LCONV_ENTRY(int_n_cs_precedes),
- LCONV_ENTRY(int_n_sep_by_space),
- LCONV_ENTRY(int_p_sign_posn),
- LCONV_ENTRY(int_n_sign_posn),
-# endif
-# endif
- {NULL, 0}
- };
+ /* The other iteration is only for the monetary string fields */
+ if ( i == MONETARY_STRING_OFFSET
+ && (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) == 0)
+ {
+ continue;
+ }
- static const unsigned category_indices[] = {
-# ifdef USE_LOCALE_NUMERIC
- LC_NUMERIC_INDEX_,
-# endif
-# ifdef USE_LOCALE_MONETARY
- LC_MONETARY_INDEX_,
# endif
- (unsigned) -1 /* Just so the previous element can always end with a
- comma => subtract 1 below for the max loop index */
- };
- const char *ptr = (const char *) lcbuf;
- const struct lconv_offset *integers = lconv_integers;
-
- HV * retval = newHV();
- sv_2mortal((SV*)retval);
+ /* For each field for the given category ... */
+ const lconv_offset_t * category_strings = strings[i];
+ while (1) {
+ const char * name = category_strings->name;
+ if (! name) { /* Quit at the end */
+ break;
+ }
- PERL_ARGS_ASSERT_POPULATE_LOCALECONV;
+ /* we have set things up so that we know where in the returned
+ * structure, when viewed as a string, the corresponding value is.
+ * */
+ const char *value = *((const char **)( lcbuf_as_string
+ + category_strings->offset));
- /* For each enabled category ... */
- for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(category_indices) - 1; i++) {
- const unsigned cat_index = category_indices[i];
- locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
- const char *locale;
+ /* Set to get next string on next iteration */
+ category_strings++;
- /* ( = NULL silences a compiler warning; would segfault if it could
- * actually happen.) */
- const struct lconv_offset *strings = NULL;
+ /* Skip if this platform doesn't have this field. */
+ if (! value) {
+ continue;
+ }
-# ifdef USE_LOCALE_NUMERIC
- if (cat_index == LC_NUMERIC_INDEX_) {
- locale_is_utf8 = numeric_locale_is_utf8;
- strings = lconv_numeric_strings;
+ /* Copy to the hash */
+ (void) hv_store(hv,
+ name, strlen(name),
+ newSVpv(value, strlen(value)),
+ 0);
}
-# else
- PERL_UNUSED_ARG(numeric_locale_is_utf8);
-# endif
-# ifdef USE_LOCALE_MONETARY
- if (cat_index == LC_MONETARY_INDEX_) {
- locale_is_utf8 = monetary_locale_is_utf8;
- strings = lconv_monetary_strings;
- }
-# else
- PERL_UNUSED_ARG(monetary_locale_is_utf8);
-# endif
- assert(locale_is_utf8 != LOCALE_UTF8NESS_UNKNOWN);
-
- /* Iterate over the strings structure for this category */
- locale = querylocale_i(cat_index);
-
- while (strings->name) {
- const char *value = *((const char **)(ptr + strings->offset));
- if (value) {
- bool is_utf8 = /* Only make UTF-8 if required to */
- (UTF8NESS_YES == (get_locale_string_utf8ness_i(value,
- locale_is_utf8,
- locale,
- cat_index)));
- (void) hv_store(retval,
- strings->name,
- strlen(strings->name),
- newSVpvn_utf8(value, strlen(value), is_utf8),
- 0);
+ /* Add any int fields to the HV* */
+ if (i == MONETARY_STRING_OFFSET && integers) {
+ while (integers->name) {
+ const char value = *((const char *)( lcbuf_as_string
+ + integers->offset));
+ (void) hv_store(hv, integers->name,
+ strlen(integers->name), newSViv(value), 0);
+ integers++;
}
-
- strings++;
}
- }
-
- while (integers->name) {
- const char value = *((const char *)(ptr + integers->offset));
-
- (void) hv_store(retval, integers->name,
- strlen(integers->name), newSViv(value), 0);
- integers++;
- }
+ } /* End of loop through the fields */
- return retval;
-}
+ /* Done with copying to the hash. Can unwind the critical section locks */
-# ifndef HAS_SOME_LANGINFO
+# ifdef TS_W32_BROKEN_LOCALECONV
-STATIC HV *
-S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
- const int item,
- const locale_utf8ness_t unused1,
- const locale_utf8ness_t unused2)
-{
- /* This is a helper function for my_localeconv(), which is called from
- * my_langinfo() to emulate the libc nl_langinfo() function on platforms
- * that don't have it available.
- *
- * This function acts as an extension to my_langinfo(), the intermediate
- * my_localeconv() call is to set up the locks and switch into the proper
- * locale. That logic exists for other reasons, and by doing it this way,
- * it doesn't have to be duplicated.
- *
- * This function extracts the current value of 'item' in the current locale
- * using the localconv() result also passed in, via 'lcbuf'. The other
- * parameter is unused, a placeholder so the signature of this function
- * matches another that does need it, and so the two functions can be
- * referred to by a single function pointer, to simplify the code below */
+ /* Restore the global locale's prior state */
+ void_setlocale_c(LC_ALL, save_global);
- const char * prefix = "";
- const char * temp = NULL;
+ /* And back to per-thread locales */
+ if (restore_per_thread) {
+ _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
+ }
- PERL_ARGS_ASSERT_GET_NL_ITEM_FROM_LOCALECONV;
- PERL_UNUSED_ARG(unused1);
- PERL_UNUSED_ARG(unused2);
+ /* Restore the per-thread locale state */
+ void_setlocale_c(LC_ALL, save_thread);
- switch (item) {
- case CRNCYSTR:
- temp = lcbuf->currency_symbol;
+# endif /* TS_W32_BROKEN_LOCALECONV */
- if (lcbuf->p_cs_precedes) {
+ gwLOCALE_UNLOCK; /* Finished with the critical section of a
+ globally-accessible buffer */
- /* khw couldn't find any documentation that CHAR_MAX is the signal,
- * but cygwin uses it thusly */
- if (lcbuf->p_cs_precedes == CHAR_MAX) {
- prefix = ".";
- }
- else {
- prefix = "-";
- }
- }
- else {
- prefix = "+";
- }
+# ifdef USE_LOCALE_NUMERIC
- break;
+ restore_toggled_locale_i(LC_NUMERIC_INDEX_, orig_NUMERIC_locale);
+ if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
+ LC_NUMERIC_UNLOCK;
+ }
- case RADIXCHAR:
- temp = lcbuf->decimal_point;
- break;
+# endif
+# ifdef USE_LOCALE_CTYPE
- case THOUSEP:
- temp = lcbuf->thousands_sep;
- break;
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
- default:
- locale_panic_(Perl_form(aTHX_
- "Unexpected item passed to populate_localeconv: %d", item));
- }
+# endif
- return (HV *) Perl_newSVpvf(aTHX_ "%s%s", prefix, temp);
}
-# endif /* ! Has some form of langinfo() */
-#endif /* Has some form of localeconv() and paying attn to a category it
- traffics in */
-
+#endif /* defined(HAS_LOCALECONV) */
#ifndef HAS_SOME_LANGINFO
typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */
@@ -4158,24 +4298,78 @@ S_my_langinfo_i(pTHX_
* TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
* GetCurrencyFormat; patches welcome) */
+# define P_CS_PRECEDES "p_cs_precedes"
+# define CURRENCY_SYMBOL "currency_symbol"
+
/* case RADIXCHAR: // May drop down to here in some configurations */
- case CRNCYSTR:
case THOUSEP:
- {
- SV * string = (SV *) my_localeconv(item);
+ case CRNCYSTR:
+ {
+
+ /* The hash gets populated with just the field(s) related to 'item'. */
+ HV * result_hv = my_localeconv(item);
- retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
+ SV* string;
+ if (item != CRNCYSTR) {
- if (utf8ness) {
- is_utf8 = get_locale_string_utf8ness_i(retval,
- LOCALE_UTF8NESS_UNKNOWN,
- locale, cat_index);
+ /* These items have been populated with just one key => value */
+ (void) hv_iterinit(result_hv);
+ HE * entry = hv_iternext(result_hv);
+ string = hv_iterval(result_hv, entry);
+ }
+ else {
+
+ /* But CRNCYSTR localeconv() returns a slightly different value
+ * than the nl_langinfo() API calls for, so have to modify this one
+ * to conform. We need another value from localeconv() to know
+ * what to change it to. my_localeconv() has populated the hash
+ * with exactly both fields. Delete this one, leaving just the
+ * CRNCYSTR one in the hash */
+ SV* precedes = hv_delete(result_hv,
+ P_CS_PRECEDES, STRLENs(P_CS_PRECEDES),
+ 0);
+ if (! precedes) {
+ locale_panic_("my_localeconv() unexpectedly didn't return"
+ " a value for " P_CS_PRECEDES);
}
- SvREFCNT_dec_NN(string);
- break;
+ /* The modification is to prefix the localeconv() return with a
+ * single byte, calculated as follows: */
+ char prefix = (LIKELY(SvIV(precedes) != CHAR_MAX))
+ ? ((precedes != 0) ? '-' : '+')
+
+ /* khw couldn't find any documentation that
+ * CHAR_MAX is the signal, but cygwin uses it
+ * thusly, and it makes sense given that CHAR_MAX
+ * indicates the value isn't used, so it neither
+ * precedes nor succeeds */
+ : '.';
+
+ /* Now get CRNCYSTR */
+ (void) hv_iterinit(result_hv);
+ HE * entry = hv_iternext(result_hv);
+ string = hv_iterval(result_hv, entry);
+
+ /* And perform the modification */
+ Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string));
+ }
+
+ /* Here, 'string' contains the value we want to return */
+ retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
+
+ if (utf8ness) {
+ is_utf8 = (SvUTF8(string))
+ ? UTF8NESS_YES
+ : (is_utf8_invariant_string( (U8 *) retval,
+ strlen(retval)))
+ ? UTF8NESS_IMMATERIAL
+ : UTF8NESS_NO;
}
+ break;
+
+ }
+
# endif /* Some form of localeconv */
# ifdef HAS_STRFTIME