summary | refs | log | tree | commit | diff
path: root/dist/Data-Dumper
diff options
context:
space:
mode:
author: Nicholas Clark <nick@ccl4.org> 2021-05-13 10:15:28 +0000
committer: Nicholas Clark <nick@ccl4.org> 2021-05-22 08:22:26 +0000
commit: d30553598b9a36dc7e48770337611be88403d643 (patch)
tree: bf54e83123f7674a886ff1623ed05f480fe0ba25 /dist/Data-Dumper
parent: 22d88af0e3e04dca72dcd494a6298431e57aeae0 (diff)
download: perl-d30553598b9a36dc7e48770337611be88403d643.tar.gz
Rework Data::Dumper Unicode-in-qr support.
This approach (and this commit message) are based on Aaron Crane's original in GH #18771. However, we leave the pure-Perl Dump unchanged (which means changing the tests somewhat), and need to handle one more corner case (\x{...} escaping a Unicode character that follows a backslash). The previous approach was to upgrade the output to the internal UTF-8 encoding when dumping a regex containing supra-Latin-1 characters. That has the disadvantage that nothing else generates wide characters in the output, or even knows that the output might be upgraded. A better approach, and one that's more consistent with the one taken for string literals, is to use `\x{…}` notation where needed. Closes #18764
Diffstat (limited to 'dist/Data-Dumper')
-rw-r--r-- dist/Data-Dumper/Dumper.xs | 64
-rw-r--r-- dist/Data-Dumper/t/dumper.t | 6
2 files changed, 49 insertions, 21 deletions
diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs
index fb86a21b70..9fb1183644 100644
--- a/dist/Data-Dumper/Dumper.xs
+++ b/dist/Data-Dumper/Dumper.xs
@@ -622,9 +622,10 @@ dump_regexp(pTHX_ SV *retval, SV *val)
SV *sv_pattern = NULL;
SV *sv_flags = NULL;
const char *rval;
- const char *rend;
- const char *slash;
+ const U8 *rend;
+ U8 *p;
CV *re_pattern_cv = get_cv("re::regexp_pattern", 0);
+ int do_utf8;
if (!re_pattern_cv) {
sv_pattern = val;
@@ -656,6 +657,8 @@ dump_regexp(pTHX_ SV *retval, SV *val)
assert(sv_pattern);
+ sv_catpvs(retval, "qr/");
+
/* The strategy here is from commit 7894fbab1e479c2c (in June 1999) with a
* bug fix in Feb 2012 (commit de5ef703c7d8db65).
* We need to ensure that / is escaped as \/
@@ -670,27 +673,58 @@ dump_regexp(pTHX_ SV *retval, SV *val)
* \ and the character immediately after (together)
* a character
* and only for the latter, do we need to escape /
+ *
+ * Of course, to add to the fun, we also need to escape Unicode characters
+ * to \x{...} notation (whether they are "escaped" by \ or stand alone).
+ * We can do all this in one pass if we are careful...
*/
rval = SvPV(sv_pattern, rlen);
- rend = rval+rlen;
- slash = rval;
- sv_catpvs(retval, "qr/");
+ p = (U8 *)rval;
+ rend = p + rlen;
+ do_utf8 = DO_UTF8(sv_pattern);
+
+ while (p < rend) {
+ UV k = *p;
+ int saw_backslash = k == '\\';
+
+ if (saw_backslash) {
+ if (++p == rend) {
+ /* Oh my, \ at the end. Is this possible? */
+ break;
+ }
+ /* Otherwise we look at the next octet */
+ k = *p;
+ }
- for ( ; slash < rend; slash++) {
- if (*slash == '\\') {
- ++slash;
- continue;
+ if ((k == '/' && !saw_backslash) || (do_utf8 && ! isASCII(k) && k > ' ')) {
+ STRLEN to_copy = p - (U8 *) rval;
+ if (to_copy) {
+ /* If saw_backslash is true, this will copy the \ for us too. */
+ sv_catpvn(retval, rval, to_copy);
+ }
+ if (k == '/') {
+ sv_catpvs(retval, "\\/");
+ ++p;
+ }
+ else {
+ /* If there was a \, we have copied it already, so all that is
+ * left to do here is the \x{...} escaping. */
+ k = utf8_to_uvchr_buf(p, rend, NULL);
+ sv_catpvf(retval, "\\x{%" UVxf "}", k);
+ p += UTF8SKIP(p);
+ }
+ rval = (const char *) p;
}
- if (*slash == '/') {
- sv_catpvn(retval, rval, slash-rval);
- sv_catpvs(retval, "\\/");
- rlen -= slash-rval+1;
- rval = slash+1;
+ else {
+ ++p;
}
}
- sv_catpvn(retval, rval, rlen);
+ rlen = rend - (U8 *) rval;
+ if (rlen) {
+ sv_catpvn(retval, rval, rlen);
+ }
sv_catpvs(retval, "/");
if (sv_flags)
diff --git a/dist/Data-Dumper/t/dumper.t b/dist/Data-Dumper/t/dumper.t
index af8c10308a..f05449e849 100644
--- a/dist/Data-Dumper/t/dumper.t
+++ b/dist/Data-Dumper/t/dumper.t
@@ -1734,9 +1734,6 @@ EOW
TEST qq(Data::Dumper->Dump([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/, qr/\x{e4}/, "\xE4"] ])),
"string with Unicode + regexp with Unicode";
- SKIP_TEST "skipped, pending fix for github #18764";
- last;
-
$WANT =~ s/'\xE4'/"\\x{e4}"/;
$WANT =~ s<([^\0-\177])> <sprintf '\\x{%x}', ord $1>ge;
TEST qq(Data::Dumper->Dumpxs([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/, qr/\x{e4}/, "\xE4"] ])),
@@ -1797,9 +1794,6 @@ EOW
TEST qq(Data::Dumper->Dump([ [ '\x{2e18}', qr! \x{203d}/ !, qr! \\\x{203d}/ !, qr! \\\x{203d}$bs:/ !, "\xa3"] ])),
"github #18614, github #18764, perl #58608 corner cases";
- SKIP_TEST "skipped, pending fix for github #18764";
- last;
-
$WANT =~ s/'\x{A3}'/"\\x{a3}"/;
$WANT =~ s/\x{203D}/\\x{203d}/g;
TEST qq(Data::Dumper->Dumpxs([ [ '\x{2e18}', qr! \x{203d}/ !, qr! \\\x{203d}/ !, qr! \\\x{203d}$bs:/ !, "\xa3"] ])),