diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | ext/I18N-Langinfo/Langinfo.xs | 64 | ||||
-rw-r--r-- | locale.c | 123 | ||||
-rw-r--r-- | pod/perldelta.pod | 7 | ||||
-rw-r--r-- | proto.h | 4 |
5 files changed, 83 insertions, 117 deletions
@@ -1646,8 +1646,10 @@ ATdo |const char*|Perl_setlocale|const int category|NULLOK const char* locale Ado |HV * |Perl_localeconv #if defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H) ATdo |const char*|Perl_langinfo|const nl_item item +ATdo |const char*|Perl_langinfo8|const nl_item item|NULLOK utf8ness_t * utf8ness #else ATdo |const char*|Perl_langinfo|const int item +ATdo |const char*|Perl_langinfo8|const int item|NULLOK utf8ness_t * utf8ness #endif pEX |int |mbtowc_|NULLOK const wchar_t * pwc|NULLOK const char * s|const Size_t len CpO |int |init_i18nl10n |int printwarn diff --git a/ext/I18N-Langinfo/Langinfo.xs b/ext/I18N-Langinfo/Langinfo.xs index 904b424b19..3b05e9f791 100644 --- a/ext/I18N-Langinfo/Langinfo.xs +++ b/ext/I18N-Langinfo/Langinfo.xs @@ -25,8 +25,8 @@ SV* langinfo(code) int code PREINIT: - const char * value; - STRLEN len; + const char * value; + utf8ness_t is_utf8; PROTOTYPE: _ CODE: #ifdef HAS_NL_LANGINFO @@ -36,64 +36,8 @@ langinfo(code) } else #endif { - value = Perl_langinfo(code); - len = strlen(value); - RETVAL = newSVpvn(Perl_langinfo(code), len); - - /* Now see if the UTF-8 flag should be turned on */ -#ifdef USE_LOCALE_CTYPE /* No utf8 strings if not using LC_CTYPE */ - - /* If 'value' is ASCII or not legal UTF-8, the flag doesn't get - * turned on, so skip the followin code */ - if (is_utf8_non_invariant_string((U8 *) value, len)) { - int category; - - /* Check if the locale is a UTF-8 one. The returns from - * Perl_langinfo() are in different locale categories, so check the - * category corresponding to this item */ - switch (code) { - - /* This should always return ASCII, so we could instead - * legitimately panic here, but soldier on */ - case CODESET: - category = LC_CTYPE; - break; - - case RADIXCHAR: - case THOUSEP: -# ifdef USE_LOCALE_NUMERIC - category = LC_NUMERIC; -# else - /* Not ideal, but the best we can do on such a platform */ - category = LC_CTYPE; -# endif - break; - - case CRNCYSTR: -# ifdef USE_LOCALE_MONETARY - category = LC_MONETARY; -# else - category = LC_CTYPE; -# endif - break; - - default: -# ifdef USE_LOCALE_TIME - category = LC_TIME; -# else - category = LC_CTYPE; -# endif - break; - } - - /* Here the return is legal UTF-8. Turn on that flag if the - * locale is UTF-8. (Otherwise, could just be a coincidence.) - * */ - if (_is_cur_LC_category_utf8(category)) { - SvUTF8_on(RETVAL); - } - } -#endif /* USE_LOCALE_CTYPE */ + value = Perl_langinfo8(code, &is_utf8); + RETVAL = newSVpvn_utf8(value, strlen(value), is_utf8 == UTF8NESS_YES); } OUTPUT: @@ -3498,32 +3498,43 @@ S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf, #endif /* Has some form of localeconv() and paying attn to a category it traffics in */ +#ifndef HAS_SOME_LANGINFO + +typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */ + +#endif + /* -=for apidoc Perl_langinfo +=for apidoc Perl_langinfo +=for apidoc_item Perl_langinfo8 + +C<Perl_langinfo> is an (almost) drop-in replacement for the system +C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning +the same information. But it is more thread-safe than regular +C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your +code, and can be used on systems that lack a native C<nl_langinfo>. -This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>, -taking the same C<item> parameter values, and returning the same information. -But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks -of Perl's locale handling from your code, and can be used on systems that lack -a native C<nl_langinfo>. +However, you should instead use the improved version of this: +L</Perl_langinfo8>, which behaves identically except for an additional +parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it +returns to you how you should treat the returned string with regards to it +being encoded in UTF-8 or not. -Expanding on these: +Concerning the differences between these and plain C<nl_langinfo()>: =over -=item * +=item a. -The reason it isn't quite a drop-in replacement is actually an advantage. The -only difference is that it returns S<C<const char *>>, whereas plain -C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation) -forbidden to write into the buffer. By declaring this C<const>, the compiler -enforces this restriction, so if it is violated, you know at compilation time, -rather than getting segfaults at runtime. +C<Perl_langinfo8> has an extra parameter, described above. Besides this, the +other reasons they aren't quite a drop-in replacement is actually an advantage. +The C<const>ness of the return allows the compiler to catch attempts to write +into the returned buffer, which is illegal and could cause run-time crashes. -=item * +=item b. -It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items, +They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items, without you having to write extra code. The reason for the extra code would be because these are from the C<LC_NUMERIC> locale category, which is normally kept set by Perl so that the radix is a dot, and the separator is the empty @@ -3535,50 +3546,50 @@ the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C (or equivalent) locale would break a lot of CPAN, which is expecting the radix (decimal point) character to be a dot.) -=item * +=item c. -The system function it replaces can have its static return buffer trashed, +The system function they replace can have its static return buffer trashed, not only by a subsequent call to that function, but by a C<freelocale>, -C<setlocale>, or other locale change. The returned buffer of this function is -not changed until the next call to it, so the buffer is never in a trashed -state. +C<setlocale>, or other locale change. The returned buffer of these functions +is not changed until the next call to one or the other, so the buffer is never +in a trashed state. -=item * +=item d. -Its return buffer is per-thread, so it also is never overwritten by a call to -this function from another thread; unlike the function it replaces. +The return buffer is per-thread, so it also is never overwritten by a call to +these functions from another thread; unlike the function it replaces. -=item * +=item e. -But most importantly, it works on systems that don't have C<nl_langinfo>, such -as Windows, hence makes your code more portable. Of the fifty-some possible +But most importantly, they work on systems that don't have C<nl_langinfo>, such +as Windows, hence making your code more portable. Of the fifty-some possible items specified by the POSIX 2008 standard, L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>, only one is completely unimplemented, though on non-Windows platforms, another -significant one is also not implemented). It uses various techniques to +significant one is not fully implemented). They use various techniques to recover the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>, both of which are specified in C89, so should be always be available. Later C<strftime()> versions have additional capabilities; C<""> is -returned for those not available on your system. +returned for any item not available on your system. -It is important to note that when called with an item that is recovered by +It is important to note that, when called with an item that is recovered by using C<localeconv>, the buffer from any previous explicit call to -C<localeconv> will be overwritten. This means you must save that buffer's -contents if you need to access them after a call to this function. (But note -that you might not want to be using C<localeconv()> directly anyway, because of -issues like the ones listed in the second item of this list (above) for -C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to -call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to -unpack). +C<L<localeconv(3)>> will be overwritten. But you shouldn't be using +C<localeconv> anyway because it is is very much not thread-safe, and suffers +from the same problems outlined in item 'b.' above for the fields it returns that +are controlled by the LC_NUMERIC locale category. Instead, avoid all of those +problems by calling L</Perl_localeconv>, which is thread-safe; or by using the +methods given in L<perlcall> to call +L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe. + +=back The details for those items which may deviate from what this emulation returns and what a native C<nl_langinfo()> would return are specified in L<I18N::Langinfo>. -=back - -When using C<Perl_langinfo> on systems that don't have a native -C<nl_langinfo()>, you must +When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't +have a native C<nl_langinfo()>, you must #include "perl_langinfo.h" @@ -3587,30 +3598,28 @@ C<#include> with this one. (Doing it this way keeps out the symbols that plain C<langinfo.h> would try to import into the namespace for code that doesn't need it.) -The original impetus for C<Perl_langinfo()> was so that code that needs to -find out the current currency symbol, floating point radix character, or digit -grouping separator can use, on all systems, the simpler and more -thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a -pain to make thread-friendly. For other fields returned by C<localeconv>, it -is better to use the methods given in L<perlcall> to call -L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly. - =cut */ -#ifndef HAS_SOME_LANGINFO - -typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */ - -#endif - const char * Perl_langinfo(const nl_item item) { + return Perl_langinfo8(item, NULL); +} + +const char * +Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness) +{ dTHX; unsigned cat_index; + PERL_ARGS_ASSERT_PERL_LANGINFO8; + + if (utf8ness) { /* Assume for now */ + *utf8ness = UTF8NESS_IMMATERIAL; + } + /* Find the locale category that controls the input 'item'. If we are not * paying attention to that category, instead return a default value. Also * return the default value if there is no way for us to figure out the @@ -3774,7 +3783,7 @@ Perl_langinfo(const nl_item item) /* Use either the underlying numeric, or the other underlying categories */ if (cat_index == LC_NUMERIC_INDEX_) { return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name, - &PL_langinfo_buf, &PL_langinfo_bufsize, NULL); + &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness); } else @@ -3782,7 +3791,7 @@ Perl_langinfo(const nl_item item) { return my_langinfo_i(item, cat_index, querylocale_i(cat_index), - &PL_langinfo_buf, &PL_langinfo_bufsize, NULL); + &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness); } #endif diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 82db679dd9..89af9bb80d 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -359,6 +359,13 @@ same as plain L<perlapi/C<my_strftime>>, but with an extra parameter that allows the caller to simply and reliably know if the returned string is UTF-8. +=item * + +A new API function, L<perlapi/C<Perl_langinfo8>> is added. This is the +same as plain L<perlapi/C<Perl_langinfo>>, but with an extra parameter +that allows the caller to simply and reliably know if the returned +string is UTF-8. + =back =head1 Selected Bug Fixes @@ -4681,6 +4681,8 @@ STATIC int S_sv_2iuv_non_preserve(pTHX_ SV *const sv); #if !(defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H)) PERL_CALLCONV const char* Perl_langinfo(const int item); #define PERL_ARGS_ASSERT_PERL_LANGINFO +PERL_CALLCONV const char* Perl_langinfo8(const int item, utf8ness_t * utf8ness); +#define PERL_ARGS_ASSERT_PERL_LANGINFO8 #endif #if !(defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L)) # if defined(PERL_IN_LOCALE_C) @@ -5222,6 +5224,8 @@ PERL_CALLCONV I32 Perl_do_shmio(pTHX_ I32 optype, SV** mark, SV** sp) #if defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H) PERL_CALLCONV const char* Perl_langinfo(const nl_item item); #define PERL_ARGS_ASSERT_PERL_LANGINFO +PERL_CALLCONV const char* Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness); +#define PERL_ARGS_ASSERT_PERL_LANGINFO8 #endif #if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L) # if defined(PERL_IN_LOCALE_C) |