diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-02-15 11:31:27 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-02-15 18:02:35 -0700 |
commit | 2e2b25717dbde8d9ce48b4b8dc443e1d08166347 (patch) | |
tree | ca10f48aa5a2fa0549aebebed4109a9d8c59aa24 /pp.c | |
parent | adfec83175578461303ab5cfcc90d37cb3114126 (diff) | |
download | perl-2e2b25717dbde8d9ce48b4b8dc443e1d08166347.tar.gz |
perl #77654: quotemeta quotes non-ASCII consistently
As described in the pod changes in this commit, this changes quotemeta()
to consistently quote non-ASCII characters when used under
unicode_strings. The behavior is changed for these and UTF-8 encoded
strings to more closely align with Unicode's recommendations.
The end result is that we *could* at some future point start using
characters other than the current 12 as metacharacters.
Diffstat (limited to 'pp.c')
-rw-r--r-- | pp.c | 37 |
1 files changed, 29 insertions, 8 deletions
@@ -4088,24 +4088,45 @@ PP(pp_quotemeta) d = SvPVX(TARG); if (DO_UTF8(sv)) { while (len) { - if (UTF8_IS_CONTINUED(*s)) { STRLEN ulen = UTF8SKIP(s); + bool to_quote = FALSE; + + if (UTF8_IS_INVARIANT(*s)) { + if (_isQUOTEMETA(*s)) { + to_quote = TRUE; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + if (_isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)))) + { + to_quote = TRUE; + } + } + else if (_is_utf8_quotemeta(s)) { + to_quote = TRUE; + } + + if (to_quote) { + *d++ = '\\'; + } if (ulen > len) ulen = len; len -= ulen; while (ulen--) *d++ = *s++; - } - else { - if (!isALNUM(*s)) - *d++ = '\\'; - *d++ = *s++; - len--; - } } SvUTF8_on(TARG); } + else if (IN_UNI_8_BIT) { + while (len--) { + if (_isQUOTEMETA(*s)) + *d++ = '\\'; + *d++ = *s++; + } + } else { + /* For non UNI_8_BIT (and hence in locale) just quote all \W + * including everything above ASCII */ while (len--) { if (!isWORDCHAR_A(*s)) *d++ = '\\'; |