summary refs log tree commit diff
path: root/doop.c
diff options
context:
space:
mode:
author David Mitchell <davem@iabyn.com> 2018-01-11 11:45:49 +0000
committer David Mitchell <davem@iabyn.com> 2018-01-19 13:45:19 +0000
commit f146bab680e5a22cf4657df42193385ddd2b538c (patch)
tree 233f1c84d32d3195bf371fce30a72ceb7d44098a /doop.c
parent ea088e559c9bca8e7337d3d6236f06deb5afda32 (diff)
download perl-f146bab680e5a22cf4657df42193385ddd2b538c.tar.gz
fix "\x{100}..." =~ tr/.../.../cd
In transliterations where the search and replacement charlists are non-utf8, but the string being modified contains codepoints >= 0x100, tr/.../.../cd would always delete all such codepoints, rather than potentially mapping some of them. In more detail: in the presence of /c (complement), an implicit 0x100..0x7fffffff is added to a non-utf8 search charlist. If the replacement list is longer than the < 0x100 part of the search list, then the last few replacement chars should in principle be paired off against the first few of (\x{100}, \x{101}, ...). However, this wasn't happening. For example, tr/\x00-\xfd/ABCD/cd should be equivalent to tr/\xfe-\x{7fffffff}/ABCD/d which should map: \xfe => A, \xff => B, \x{100} => C, \x{101} => D, and delete \x{102} onwards. But instead, it behaved like tr/\xfe-\x{7fffffff}/AB/d and deleted all codepoints >= 0x100. This commit fixes that by using the extended mapping table format for all /c variants (formerly it excluded /cd). I also changed a variable holding the mapped char from being I32 to UV: principally to avoid a casting mess in the fixed code. This may (or may not), as a side-effect, have fixed possible issues with very large codepoints.
Diffstat (limited to 'doop.c')
-rw-r--r-- doop.c 64
1 files changed, 39 insertions, 25 deletions
diff --git a/doop.c b/doop.c
index 7dc3fe6252..c7973ff985 100644
--- a/doop.c
+++ b/doop.c
@@ -224,7 +224,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
else
d = s;
dstart = d;
- if (complement && !del)
+ if (complement)
/* number of replacement chars in excess of any 0x00..0xff
* search characters */
excess = (SSize_t)tbl[0x100];
@@ -235,7 +235,8 @@ S_do_trans_complex(pTHX_ SV * const sv)
STRLEN len;
const UV comp = utf8n_to_uvchr(s, send - s, &len,
UTF8_ALLOW_DEFAULT);
- I32 ch;
+ UV ch;
+ short sch;
if (comp > 0xff) {
if (!complement) {
@@ -245,34 +246,40 @@ S_do_trans_complex(pTHX_ SV * const sv)
else {
/* use the implicit 0x100..0x7fffffff search range */
matches++;
- if (!del) {
- ch = (excess == -1) ? (I32)comp :
- ( excess == 0
- || excess < (IV)comp - 0xff) ? tbl[0x101]
- : tbl[comp+2];
- if ((UV)ch != pch) {
+ ch = del
+ /* setting ch to pch forces char to be deleted */
+ ? ((excess >= (IV)comp - 0xff) ? (UV)tbl[comp+2]
+ : pch )
+
+ : ( (excess == -1) ? comp :
+ (UV)(( excess == 0
+ || excess < (IV)comp - 0xff) ? tbl[0x101]
+ : tbl[comp+2]
+ )
+ );
+ if (ch != pch) {
d = uvchr_to_utf8(d, ch);
- pch = (UV)ch;
+ pch = ch;
}
s += len;
continue;
- }
}
}
- else if ((ch = tbl[comp]) >= 0) {
+ else if ((sch = tbl[comp]) >= 0) {
+ ch = (UV)sch;
matches++;
- if ((UV)ch != pch) {
+ if (ch != pch) {
d = uvchr_to_utf8(d, ch);
- pch = (UV)ch;
+ pch = ch;
}
s += len;
continue;
}
- else if (ch == -1) { /* -1 is unmapped character */
+ else if (sch == -1) { /* -1 is unmapped character */
Move(s, d, len, U8);
d += len;
}
- else if (ch == -2) /* -2 is delete character */
+ else if (sch == -2) /* -2 is delete character */
matches++;
s += len;
pch = 0xfeedface;
@@ -283,7 +290,8 @@ S_do_trans_complex(pTHX_ SV * const sv)
STRLEN len;
const UV comp = utf8n_to_uvchr(s, send - s, &len,
UTF8_ALLOW_DEFAULT);
- I32 ch;
+ UV ch;
+ short sch;
if (comp > 0xff) {
if (!complement) {
Move(s, d, len, U8);
@@ -292,26 +300,32 @@ S_do_trans_complex(pTHX_ SV * const sv)
else {
/* use the implicit 0x100..0x7fffffff search range */
matches++;
- if (!del) {
+ if (del) {
+ if (excess >= (IV)comp - 0xff) {
+ ch = (UV)tbl[comp+2];
+ d = uvchr_to_utf8(d, ch);
+ }
+ }
+ else {
/* tr/...//c should call S_do_trans_count
* instead */
assert(excess != -1);
- ch = ( excess == 0
- || excess < (IV)comp - 0xff) ? tbl[0x101]
- : tbl[comp+2];
+ ch = (UV)( excess == 0
+ || excess < (IV)comp-0xff) ? tbl[0x101]
+ : tbl[comp+2];
d = uvchr_to_utf8(d, ch);
- }
+ }
}
}
- else if ((ch = tbl[comp]) >= 0) {
- d = uvchr_to_utf8(d, ch);
+ else if ((sch = tbl[comp]) >= 0) {
+ d = uvchr_to_utf8(d, (UV)sch);
matches++;
}
- else if (ch == -1) { /* -1 is unmapped character */
+ else if (sch == -1) { /* -1 is unmapped character */
Move(s, d, len, U8);
d += len;
}
- else if (ch == -2) /* -2 is delete character */
+ else if (sch == -2) /* -2 is delete character */
matches++;
s += len;
}