diff options
-rw-r--r-- | embedvar.h | 1 | ||||
-rw-r--r-- | intrpvar.h | 1 | ||||
-rw-r--r-- | locale.c | 35 | ||||
-rw-r--r-- | perl.c | 2 | ||||
-rw-r--r-- | perl.h | 23 | ||||
-rw-r--r-- | pod/perldelta.pod | 9 | ||||
-rw-r--r-- | pod/perldiag.pod | 20 | ||||
-rw-r--r-- | pp.c | 23 | ||||
-rw-r--r-- | regexec.c | 34 | ||||
-rw-r--r-- | sv.c | 3 | ||||
-rw-r--r-- | utf8.c | 51 |
11 files changed, 166 insertions, 36 deletions
diff --git a/embedvar.h b/embedvar.h index 32a8b9b327..da3c331634 100644 --- a/embedvar.h +++ b/embedvar.h @@ -352,6 +352,7 @@ #define PL_utf8_xidstart (vTHX->Iutf8_xidstart) #define PL_utf8cache (vTHX->Iutf8cache) #define PL_utf8locale (vTHX->Iutf8locale) +#define PL_warn_locale (vTHX->Iwarn_locale) #define PL_warnhook (vTHX->Iwarnhook) #define PL_watchaddr (vTHX->Iwatchaddr) #define PL_watchok (vTHX->Iwatchok) diff --git a/intrpvar.h b/intrpvar.h index 3bb1c9af8b..eb962836ca 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -238,6 +238,7 @@ PERLVAR(I, exit_flags, U8) /* was exit() unexpected, etc. */ PERLVAR(I, utf8locale, bool) /* utf8 locale detected */ PERLVAR(I, in_utf8_CTYPE_locale, bool) +PERLVAR(I, warn_locale, SV *) PERLVARA(I, colors,6, char *) /* values from PERL_RE_COLORS env var */ @@ -292,6 +292,8 @@ Perl_new_ctype(pTHX_ const char *newctype) to start */ unsigned int bad_count = 0; /* Count of bad characters */ + SvREFCNT_dec(PL_warn_locale); /* We are about to overwrite this */ + for (i = 0; i < 256; i++) { if (isUPPER_LC((U8) i)) PL_fold_locale[i] = (U8) toLOWER_LC((U8) i); @@ -360,17 +362,9 @@ Perl_new_ctype(pTHX_ const char *newctype) #endif if (bad_count || multi_byte_locale) { - - /* We have to save 'newctype' because the setlocale() just below - * may destroy it. The next setlocale() further down should - * restore it properly so that the intermediate change here is - * transparent to this function's caller */ - const char * const badlocale = savepv(newctype); - - setlocale(LC_CTYPE, "C"); - Perl_warner(aTHX_ packWARN(WARN_LOCALE), + PL_warn_locale = Perl_newSVpvf(aTHX_ "Locale '%s' may not work well.%s%s%s\n", - badlocale, + newctype, (multi_byte_locale) ? " Some characters in it are not recognized by" " Perl." @@ -384,7 +378,26 @@ Perl_new_ctype(pTHX_ const char *newctype) ? bad_chars_list : "" ); - setlocale(LC_CTYPE, badlocale); + /* If we are actually in the scope of the locale, output the + * message now. Otherwise we save it to be output at the first + * operation using this locale, if that actually happens. Most + * programs don't use locales, so they are immune to bad ones */ + if (IN_LC(LC_CTYPE)) { + + /* We have to save 'newctype' because the setlocale() just + * below may destroy it. The next setlocale() further down + * should restore it properly so that the intermediate change + * here is transparent to this function's caller */ + const char * const badlocale = savepv(newctype); + + setlocale(LC_CTYPE, "C"); + + /* The '0' below suppresses a bogus gcc compiler warning */ + Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0); + setlocale(LC_CTYPE, badlocale); + SvREFCNT_dec_NN(PL_warn_locale); + PL_warn_locale = NULL; + } } } @@ -1040,6 +1040,7 @@ perl_destruct(pTHXx) SvREFCNT_dec(PL_Latin1); SvREFCNT_dec(PL_NonL1NonFinalFold); SvREFCNT_dec(PL_HasMultiCharFold); + SvREFCNT_dec(PL_warn_locale); PL_utf8_mark = NULL; PL_utf8_toupper = NULL; PL_utf8_totitle = NULL; @@ -1051,6 +1052,7 @@ perl_destruct(pTHXx) PL_AboveLatin1 = NULL; PL_InBitmap = NULL; PL_HasMultiCharFold = NULL; + PL_warn_locale = NULL; PL_Latin1 = NULL; PL_NonL1NonFinalFold = NULL; PL_UpperLatin1 = NULL; @@ -5779,6 +5779,27 @@ typedef struct am_table_short AMTS; # define IN_LC(category) \ (IN_LC_COMPILETIME(category) || IN_LC_RUNTIME(category)) +# if defined (PERL_CORE) || defined (PERL_IN_XSUB_RE) + + /* This internal macro should be called from places that operate under + * locale rules. It there is a problem with the current locale that + * hasn't been raised yet, it will output a warning this time */ +# define _CHECK_AND_WARN_PROBLEMATIC_LOCALE \ + STMT_START { \ + if (PL_warn_locale) { \ + /*GCC_DIAG_IGNORE(-Wformat-security); Didn't work */ \ + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ + SvPVX(PL_warn_locale), \ + 0 /* dummy to avoid comp warning */ ); \ + /* GCC_DIAG_RESTORE; */ \ + SvREFCNT_dec_NN(PL_warn_locale); \ + PL_warn_locale = NULL; \ + } \ + } STMT_END + + +# endif /* PERL_CORE or PERL_IN_XSUB_RE */ + #else /* No locale usage */ # define IN_LOCALE_RUNTIME 0 # define IN_SOME_LOCALE_FORM_RUNTIME 0 @@ -5793,6 +5814,8 @@ typedef struct am_table_short AMTS; # define IN_LC_COMPILETIME(category) 0 # define IN_LC_RUNTIME(category) 0 # define IN_LC(category) 0 + +# define _CHECK_AND_WARN_PROBLEMATIC_LOCALE #endif #ifdef USE_LOCALE_NUMERIC diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 6eecc008f8..a49456576a 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -221,6 +221,15 @@ XXX Changes (i.e. rewording) of diagnostic messages go here XXX Describe change here +The message +L<Locale '%s' may not work well.%s|perldiag/"Locale '%s' may not work well.%s"> +is no longer raised unless the problemtatic locale is actually used in +the Perl program. Previously it was raised if it merely was the +underlying locale. All Perl programs have an underlying locale at all +times, but something like a C<S<use locale>> is needed for that locale +to actually have some effect. This message will not be raised when +the underlying locale is hidden. + =back =head1 Utility Changes diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 1c845dde44..63df68d591 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2982,16 +2982,16 @@ likely fix this error. =item Locale '%s' may not work well.%s -(W locale) The named locale that Perl is now trying to use is not fully -compatible with Perl. The second C<%s> gives a reason. +(W locale) You are using the named locale, which is a non-UTF-8 one, and +which Perl has determined is not fully compatible with Perl. The second +C<%s> gives a reason. By far the most common reason is that the locale has characters in it that are represented by more than one byte. The only such locales that Perl can handle are the UTF-8 locales. Most likely the specified locale is a non-UTF-8 one for an East Asian language such as Chinese or Japanese. If the locale is a superset of ASCII, the ASCII portion of it -may work in Perl. Read on for problems when it isn't a superset of -ASCII. +may work in Perl. Some essentially obsolete locales that aren't supersets of ASCII, mainly those in ISO 646 or other 7-bit locales, such as ASMO 449, can also have @@ -2999,6 +2999,18 @@ problems, depending on what portions of the ASCII character set get changed by the locale and are also used by the program. The warning message lists the determinable conflicting characters. +Note that not all incompatibilities are found. + +If this happens to you, there's not much you can do except switch to use a +different locale or use L<Encode> to translate from the locale into +UTF-8; if that's impracticable, you have been warned that some things +may break. + +This message is output once each time a bad locale is switched into +within the scope of C<S<use locale>>, or on the first possibly-affected +operation if the C<S<use locale>> inherits a bad one. It is not raised +for any operations from the L<POSIX> module. + =item localtime(%f) failed (W overflow) You called C<localtime> with a number that it could not handle: @@ -3588,23 +3588,27 @@ PP(pp_ucfirst) if (op_type == OP_LCFIRST) { /* lower case the first letter: no trickiness for any character */ - *tmpbuf = #ifdef USE_LOCALE_CTYPE - (IN_LC_RUNTIME(LC_CTYPE)) - ? toLOWER_LC(*s) - : + if (IN_LC_RUNTIME(LC_CTYPE)) { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + *tmpbuf = toLOWER_LC(*s); + } + else #endif - (IN_UNI_8_BIT) - ? toLOWER_LATIN1(*s) - : toLOWER(*s); + { + *tmpbuf = (IN_UNI_8_BIT) + ? toLOWER_LATIN1(*s) + : toLOWER(*s); + } } - /* is ucfirst() */ #ifdef USE_LOCALE_CTYPE + /* is ucfirst() */ else if (IN_LC_RUNTIME(LC_CTYPE)) { if (IN_UTF8_CTYPE_LOCALE) { goto do_uni_rules; } + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; *tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any locales have upper and title case different */ @@ -3909,6 +3913,7 @@ PP(pp_uc) if (IN_UTF8_CTYPE_LOCALE) { goto do_uni_rules; } + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; for (; s < send; d++, s++) *d = (U8) toUPPER_LC(*s); } @@ -4116,6 +4121,7 @@ PP(pp_lc) * whole thing in a tight loop, for speed, */ #ifdef USE_LOCALE_CTYPE if (IN_LC_RUNTIME(LC_CTYPE)) { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; for (; s < send; d++, s++) *d = toLOWER_LC(*s); } @@ -4298,6 +4304,7 @@ PP(pp_fc) if (IN_UTF8_CTYPE_LOCALE) { goto do_uni_folding; } + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; for (; s < send; d++, s++) *d = (U8) toFOLD_LC(*s); } @@ -1456,6 +1456,7 @@ STMT_START { U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ case trie_flu8: \ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ goto do_trie_utf8_fold; \ case trie_utf8_exactfa_fold: \ flags |= FOLD_FLAGS_NOMIX_ASCII; \ @@ -1493,6 +1494,8 @@ STMT_START { } \ break; \ case trie_utf8l: \ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ + /* FALLTHROUGH */ \ case trie_utf8: \ uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ @@ -1753,6 +1756,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case ANYOF: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( @@ -1794,6 +1799,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; @@ -1921,9 +1927,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } case BOUNDL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); break; case NBOUNDL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); break; case BOUND: @@ -1958,6 +1966,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -4174,6 +4183,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; + if (scan->flags == EXACTL || scan->flags == EXACTFLU8) { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + } if ( trie->bitmap && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) { @@ -4448,6 +4460,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) #undef ST case EXACTL: /* /abc/l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case EXACT: { /* /abc/ */ char *s = STRING(scan); ln = STR_LEN(scan); @@ -4534,6 +4548,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; folder = foldEQ_locale; fold_array = PL_fold_locale; fold_utf8_flags = FOLDEQ_LOCALE; @@ -4615,6 +4630,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * have to set the FLAGS fields of these */ case BOUNDL: /* /\b/l */ case NBOUNDL: /* /\B/l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case BOUND: /* /\b/ */ case BOUNDU: /* /\b/u */ case BOUNDA: /* /\b/a */ @@ -4694,6 +4711,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOFL: /* /[abc]/l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case ANYOF: /* /[abc]/ */ if (NEXTCHR_IS_EOS) sayNO; @@ -4718,6 +4737,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* FALLTHROUGH */ case POSIXL: /* \w or [:punct:] etc. under /l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (NEXTCHR_IS_EOS) sayNO; @@ -5094,6 +5114,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; @@ -5138,6 +5159,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; folder = foldEQ_locale; fold_array = PL_fold_locale; utf8_fold_flags = FOLDEQ_LOCALE; @@ -7208,6 +7230,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; case EXACTL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case EXACT: assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); @@ -7281,6 +7305,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, goto do_exactf; case EXACTFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; utf8_flags = FOLDEQ_LOCALE; goto do_exactf; @@ -7360,6 +7385,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } case ANYOFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case ANYOF: if (utf8_target) { while (hardcount < max @@ -7382,6 +7409,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -7601,16 +7629,18 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case BOUNDL: + case NBOUNDL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case BOUND: case BOUNDA: - case BOUNDL: case BOUNDU: case EOS: case GPOS: case KEEPS: case NBOUND: case NBOUNDA: - case NBOUNDL: case NBOUNDU: case OPFAIL: case SBOL: @@ -14588,6 +14588,9 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, /* Unicode features (see perlrun/-C) */ PL_unicode = proto_perl->Iunicode; + /* Should we warn if uses locale? */ + PL_warn_locale = proto_perl->Iwarn_locale; + /* Pre-5.8 signals control */ PL_signals = proto_perl->Isignals; @@ -1600,9 +1600,14 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags &= ~FOLD_FLAGS_LOCALE; + if (flags & FOLD_FLAGS_LOCALE) { + /* Treat a UTF-8 locale as not being in locale at all */ + if (IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLD_FLAGS_LOCALE; + } + else { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + } } if (c < 256) { @@ -1949,8 +1954,14 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; - if (flags && IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; + if (flags) { + /* Treat a UTF-8 locale as not being in locale at all */ + if (IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + else { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + } } if (UTF8_IS_INVARIANT(*p)) { @@ -2014,8 +2025,14 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; - if (flags && IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; + if (flags) { + /* Treat a UTF-8 locale as not being in locale at all */ + if (IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + else { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + } } if (UTF8_IS_INVARIANT(*p)) { @@ -2078,8 +2095,14 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; - if (flags && IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; + if (flags) { + /* Treat a UTF-8 locale as not being in locale at all */ + if (IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + else { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + } } if (UTF8_IS_INVARIANT(*p)) { @@ -2153,8 +2176,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) assert(p != ustrp); /* Otherwise overwrites */ - if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) { - flags &= ~FOLD_FLAGS_LOCALE; + if (flags & FOLD_FLAGS_LOCALE) { + /* Treat a UTF-8 locale as not being in locale at all */ + if (IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLD_FLAGS_LOCALE; + } + else { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + } } if (UTF8_IS_INVARIANT(*p)) { |