diff options
author | Nicholas Clark <nick@ccl4.org> | 2021-05-13 10:15:28 +0000 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2021-05-22 08:22:26 +0000 |
commit | d30553598b9a36dc7e48770337611be88403d643 (patch) | |
tree | bf54e83123f7674a886ff1623ed05f480fe0ba25 /dist/Data-Dumper | |
parent | 22d88af0e3e04dca72dcd494a6298431e57aeae0 (diff) | |
download | perl-d30553598b9a36dc7e48770337611be88403d643.tar.gz |
Rework Data::Dumper Unicode-in-qr support.
This approach (and this commit message) are based on Aaron Crane's original
in GH #18771. However, we leave the pure-Perl Dump unchanged (which means
changing the tests somewhat), and need to handle one more corner case
(\x{...} escaping a Unicode character that follows a backslash).
The previous approach was to upgrade the output to the internal UTF-8
encoding when dumping a regex containing supra-Latin-1 characters. That
has the disadvantage that nothing else generates wide characters in the
output, or even knows that the output might be upgraded.
A better approach, and one that's more consistent with the one taken for
string literals, is to use `\x{…}` notation where needed.
Closes #18764
Diffstat (limited to 'dist/Data-Dumper')
-rw-r--r-- | dist/Data-Dumper/Dumper.xs | 64 | ||||
-rw-r--r-- | dist/Data-Dumper/t/dumper.t | 6 |
2 files changed, 49 insertions, 21 deletions
diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index fb86a21b70..9fb1183644 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -622,9 +622,10 @@ dump_regexp(pTHX_ SV *retval, SV *val) SV *sv_pattern = NULL; SV *sv_flags = NULL; const char *rval; - const char *rend; - const char *slash; + const U8 *rend; + U8 *p; CV *re_pattern_cv = get_cv("re::regexp_pattern", 0); + int do_utf8; if (!re_pattern_cv) { sv_pattern = val; @@ -656,6 +657,8 @@ dump_regexp(pTHX_ SV *retval, SV *val) assert(sv_pattern); + sv_catpvs(retval, "qr/"); + /* The strategy here is from commit 7894fbab1e479c2c (in June 1999) with a * bug fix in Feb 2012 (commit de5ef703c7d8db65). * We need to ensure that / is escaped as \/ @@ -670,27 +673,58 @@ dump_regexp(pTHX_ SV *retval, SV *val) * \ and the character immediately after (together) * a character * and only for the latter, do we need to escape / + * + * Of course, to add to the fun, we also need to escape Unicode characters + * to \x{...} notation (whether they are "escaped" by \ or stand alone). + * We can do all this in one pass if we are careful... */ rval = SvPV(sv_pattern, rlen); - rend = rval+rlen; - slash = rval; - sv_catpvs(retval, "qr/"); + p = (U8 *)rval; + rend = p + rlen; + do_utf8 = DO_UTF8(sv_pattern); + + while (p < rend) { + UV k = *p; + int saw_backslash = k == '\\'; + + if (saw_backslash) { + if (++p == rend) { + /* Oh my, \ at the end. Is this possible? */ + break; + } + /* Otherwise we look at the next octet */ + k = *p; + } - for ( ; slash < rend; slash++) { - if (*slash == '\\') { - ++slash; - continue; + if ((k == '/' && !saw_backslash) || (do_utf8 && ! isASCII(k) && k > ' ')) { + STRLEN to_copy = p - (U8 *) rval; + if (to_copy) { + /* If saw_backslash is true, this will copy the \ for us too. */ + sv_catpvn(retval, rval, to_copy); + } + if (k == '/') { + sv_catpvs(retval, "\\/"); + ++p; + } + else { + /* If there was a \, we have copied it already, so all that is + * left to do here is the \x{...} escaping. */ + k = utf8_to_uvchr_buf(p, rend, NULL); + sv_catpvf(retval, "\\x{%" UVxf "}", k); + p += UTF8SKIP(p); + } + rval = (const char *) p; } - if (*slash == '/') { - sv_catpvn(retval, rval, slash-rval); - sv_catpvs(retval, "\\/"); - rlen -= slash-rval+1; - rval = slash+1; + else { + ++p; } } - sv_catpvn(retval, rval, rlen); + rlen = rend - (U8 *) rval; + if (rlen) { + sv_catpvn(retval, rval, rlen); + } sv_catpvs(retval, "/"); if (sv_flags) diff --git a/dist/Data-Dumper/t/dumper.t b/dist/Data-Dumper/t/dumper.t index af8c10308a..f05449e849 100644 --- a/dist/Data-Dumper/t/dumper.t +++ b/dist/Data-Dumper/t/dumper.t @@ -1734,9 +1734,6 @@ EOW TEST qq(Data::Dumper->Dump([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/, qr/\x{e4}/, "\xE4"] ])), "string with Unicode + regexp with Unicode"; - SKIP_TEST "skipped, pending fix for github #18764"; - last; - $WANT =~ s/'\xE4'/"\\x{e4}"/; $WANT =~ s<([^\0-\177])> <sprintf '\\x{%x}', ord $1>ge; TEST qq(Data::Dumper->Dumpxs([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/, qr/\x{e4}/, "\xE4"] ])), @@ -1797,9 +1794,6 @@ EOW TEST qq(Data::Dumper->Dump([ [ '\x{2e18}', qr! \x{203d}/ !, qr! \\\x{203d}/ !, qr! \\\x{203d}$bs:/ !, "\xa3"] ])), "github #18614, github #18764, perl #58608 corner cases"; - SKIP_TEST "skipped, pending fix for github #18764"; - last; - $WANT =~ s/'\x{A3}'/"\\x{a3}"/; $WANT =~ s/\x{203D}/\\x{203d}/g; TEST qq(Data::Dumper->Dumpxs([ [ '\x{2e18}', qr! \x{203d}/ !, qr! \\\x{203d}/ !, qr! \\\x{203d}$bs:/ !, "\xa3"] ])), |