summaryrefslogtreecommitdiff
path: root/pp.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-02-15 11:31:27 -0700
committerKarl Williamson <public@khwilliamson.com>2012-02-15 18:02:35 -0700
commit2e2b25717dbde8d9ce48b4b8dc443e1d08166347 (patch)
treeca10f48aa5a2fa0549aebebed4109a9d8c59aa24 /pp.c
parentadfec83175578461303ab5cfcc90d37cb3114126 (diff)
downloadperl-2e2b25717dbde8d9ce48b4b8dc443e1d08166347.tar.gz
perl #77654: quotemeta quotes non-ASCII consistently
As described in the pod changes in this commit, this changes quotemeta() to consistently quote non-ASCII characters when used under unicode_strings. The behavior is changed for these and UTF-8 encoded strings to more closely align with Unicode's recommendations. The end result is that we *could* at some future point start using other characters as metacharacters than the 12 we do now.
Diffstat (limited to 'pp.c')
-rw-r--r--pp.c37
1 files changed, 29 insertions, 8 deletions
diff --git a/pp.c b/pp.c
index b12772c2de..93e59fa09f 100644
--- a/pp.c
+++ b/pp.c
@@ -4088,24 +4088,45 @@ PP(pp_quotemeta)
d = SvPVX(TARG);
if (DO_UTF8(sv)) {
while (len) {
- if (UTF8_IS_CONTINUED(*s)) {
STRLEN ulen = UTF8SKIP(s);
+ bool to_quote = FALSE;
+
+ if (UTF8_IS_INVARIANT(*s)) {
+ if (_isQUOTEMETA(*s)) {
+ to_quote = TRUE;
+ }
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+ if (_isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1))))
+ {
+ to_quote = TRUE;
+ }
+ }
+ else if (_is_utf8_quotemeta(s)) {
+ to_quote = TRUE;
+ }
+
+ if (to_quote) {
+ *d++ = '\\';
+ }
if (ulen > len)
ulen = len;
len -= ulen;
while (ulen--)
*d++ = *s++;
- }
- else {
- if (!isALNUM(*s))
- *d++ = '\\';
- *d++ = *s++;
- len--;
- }
}
SvUTF8_on(TARG);
}
+ else if (IN_UNI_8_BIT) {
+ while (len--) {
+ if (_isQUOTEMETA(*s))
+ *d++ = '\\';
+ *d++ = *s++;
+ }
+ }
else {
+ /* For non UNI_8_BIT (and hence in locale) just quote all \W
+ * including everything above ASCII */
while (len--) {
if (!isWORDCHAR_A(*s))
*d++ = '\\';