summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Mitchell <davem@iabyn.com>2017-12-26 16:40:14 +0000
committerDavid Mitchell <davem@iabyn.com>2018-01-19 11:24:54 +0000
commit334c6444f99a7a99b8470da2c82ad066563f162a (patch)
tree19ca628b47c918af4a2499622e1913e6261ef2d7
parent840d136c2c4af2a91d93c448456a52a1dda730b2 (diff)
downloadperl-334c6444f99a7a99b8470da2c82ad066563f162a.tar.gz
tr/// functions: add some basic code comments
For the various C functions which implement the compile-time and run-time aspects of OP_TRANS, add some basic code comments at the top of each function explaining what its purpose is. Also add lots of code comments to the body of S_pmtrans() (which compiles a tr///). Also comment what the OPpTRANS_ private flag bits mean. No functional changes.
-rw-r--r--doop.c63
-rw-r--r--op.c86
-rw-r--r--regen/op_private13
3 files changed, 156 insertions, 6 deletions
diff --git a/doop.c b/doop.c
index 22942818f1..6dcd05cce4 100644
--- a/doop.c
+++ b/doop.c
@@ -27,6 +27,14 @@
#include <signal.h>
#endif
+
+/* Helper function for do_trans().
+ * Handles non-utf8 cases(*) not involving the /c, /d, /s flags,
+ * and where search and replacement charlists aren't identical.
+ * (*) i.e. where the search and replacement charlists are non-utf8. sv may
+ * or may not be utf8.
+ */
+
STATIC I32
S_do_trans_simple(pTHX_ SV * const sv)
{
@@ -95,6 +103,17 @@ S_do_trans_simple(pTHX_ SV * const sv)
return matches;
}
+
+/* Helper function for do_trans().
+ * Handles non-utf8 cases(*) where search and replacement charlists are
+ * identical: so the string isn't modified, and only a count of modifiable
+ * chars is needed.
+ * Note that it doesn't handle /d or /s, since these modify the string
+ * even if the replacement list is empty.
+ * (*) i.e. where the search and replacement charlists are non-utf8. sv may
+ * or may not be utf8.
+ */
+
STATIC I32
S_do_trans_count(pTHX_ SV * const sv)
{
@@ -132,6 +151,14 @@ S_do_trans_count(pTHX_ SV * const sv)
return matches;
}
+
+/* Helper function for do_trans().
+ * Handles non-utf8 cases(*) involving the /c, /d, /s flags,
+ * and where search and replacement charlists aren't identical.
+ * (*) i.e. where the search and replacement charlists are non-utf8. sv may
+ * or may not be utf8.
+ */
+
STATIC I32
S_do_trans_complex(pTHX_ SV * const sv)
{
@@ -214,6 +241,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
d += len;
}
else {
+ /* use the implicit 0x100..0x7fffffff search range */
matches++;
if (!del) {
ch = (rlen == 0) ? (I32)comp :
@@ -259,6 +287,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
d += len;
}
else {
+ /* use the implicit 0x100..0x7fffffff search range */
matches++;
if (!del) {
if (comp - 0x100 < rlen)
@@ -295,6 +324,14 @@ S_do_trans_complex(pTHX_ SV * const sv)
return matches;
}
+
+/* Helper function for do_trans().
+ * Handles utf8 cases(*) not involving the /c, /d, /s flags,
+ * and where search and replacement charlists aren't identical.
+ * (*) i.e. where the search or replacement charlists are utf8. sv may
+ * or may not be utf8.
+ */
+
STATIC I32
S_do_trans_simple_utf8(pTHX_ SV * const sv)
{
@@ -393,6 +430,17 @@ S_do_trans_simple_utf8(pTHX_ SV * const sv)
return matches;
}
+
+/* Helper function for do_trans().
+ * Handles utf8 cases(*) where search and replacement charlists are
+ * identical: so the string isn't modified, and only a count of modifiable
+ * chars is needed.
+ * Note that it doesn't handle /d or /s, since these modify the string
+ * even if the replacement charlist is empty.
+ * (*) i.e. where the search or replacement charlists are utf8. sv may
+ * or may not be utf8.
+ */
+
STATIC I32
S_do_trans_count_utf8(pTHX_ SV * const sv)
{
@@ -436,6 +484,14 @@ S_do_trans_count_utf8(pTHX_ SV * const sv)
return matches;
}
+
+/* Helper function for do_trans().
+ * Handles utf8 cases(*) involving the /c, /d, /s flags,
+ * and where search and replacement charlists aren't identical.
+ * (*) i.e. where the search or replacement charlists are utf8. sv may
+ * or may not be utf8.
+ */
+
STATIC I32
S_do_trans_complex_utf8(pTHX_ SV * const sv)
{
@@ -597,6 +653,13 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
return matches;
}
+
+/* Execute a tr//. sv is the value to be translated, while PL_op
+ * should be an OP_TRANS or OP_TRANSR op, whose op_pv field contains a
+ * translation table or whose op_sv field contains a swash.
+ * Returns a count of number of characters translated
+ */
+
I32
Perl_do_trans(pTHX_ SV *sv)
{
diff --git a/op.c b/op.c
index ace79ad14c..4980aeedf2 100644
--- a/op.c
+++ b/op.c
@@ -6294,6 +6294,10 @@ Perl_newBINOP(pTHX_ I32 type, I32 flags, OP *first, OP *last)
return fold_constants(op_integerize(op_std_init((OP *)binop)));
}
+/* Helper function for S_pmtrans(): comparison function to sort an array
+ * of codepoint range pairs. Sorts by start point, or if equal, by end
+ * point */
+
static int uvcompare(const void *a, const void *b)
__attribute__nonnull__(1)
__attribute__nonnull__(2)
@@ -6311,6 +6315,22 @@ static int uvcompare(const void *a, const void *b)
return 0;
}
+/* Given an OP_TRANS / OP_TRANSR op o, plus OP_CONST ops expr and repl
+ * containing the search and replacement strings, assemble into
+ * a translation table attached as o->op_pv.
+ * Free expr and repl.
+ * It expects the toker to have already set the
+ * OPpTRANS_COMPLEMENT
+ * OPpTRANS_SQUASH
+ * OPpTRANS_DELETE
+ * flags as appropriate; this function may add
+ * OPpTRANS_FROM_UTF
+ * OPpTRANS_TO_UTF
+ * OPpTRANS_IDENTICAL
+ * OPpTRANS_GROWS
+ * flags
+ */
+
static OP *
S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
{
@@ -6342,6 +6362,14 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
o->op_private |= OPpTRANS_TO_UTF;
if (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) {
+
+ /* for utf8 translations, op_sv will be set to point to a swash
+ * containing codepoint ranges. This is done by first assembling
+ * a textual representation of the ranges in listsv then compiling
+ * it using swash_init(). For more details of the textual format,
+ * see L<perlunicode.pod/"User-Defined Character Properties"> .
+ */
+
SV* const listsv = newSVpvs("# comment\n");
SV* transv = NULL;
const U8* tend = t + tlen;
@@ -6383,15 +6411,24 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
* odd. */
if (complement) {
+ /* utf8 and /c:
+ * replace t/tlen/tend with a version that has the ranges
+ * complemented
+ */
U8 tmpbuf[UTF8_MAXBYTES+1];
UV *cp;
UV nextmin = 0;
Newx(cp, 2*tlen, UV);
i = 0;
transv = newSVpvs("");
+
+ /* convert search string into array of (start,end) range
+ * codepoint pairs stored in cp[]. Most "ranges" will start
+ * and end at the same char */
while (t < tend) {
cp[2*i] = utf8n_to_uvchr(t, tend-t, &ulen, flags);
t += ulen;
+ /* the toker converts X-Y into (X, ILLEGAL_UTF8_BYTE, Y) */
if (t < tend && *t == ILLEGAL_UTF8_BYTE) {
t++;
cp[2*i+1] = utf8n_to_uvchr(t, tend-t, &ulen, flags);
@@ -6402,7 +6439,19 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
i++;
}
+
+ /* sort the ranges */
qsort(cp, i, 2*sizeof(UV), uvcompare);
+
+ /* Create a utf8 string containing the complement of the
+ * codepoint ranges. For example if cp[] contains [A,B], [C,D],
+ * then transv will contain the equivalent of:
+ * join '', map chr, 0, ILLEGAL_UTF8_BYTE, A - 1,
+ * B + 1, ILLEGAL_UTF8_BYTE, C - 1,
+ * D + 1, ILLEGAL_UTF8_BYTE, 0x7fffffff;
+ * A range of a single char skips the ILLEGAL_UTF8_BYTE and
+ * end cp.
+ */
for (j = 0; j < i; j++) {
UV val = cp[2*j];
diff = val - nextmin;
@@ -6420,6 +6469,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
if (val >= nextmin)
nextmin = val + 1;
}
+
t = uvchr_to_utf8(tmpbuf,nextmin);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
{
@@ -6436,6 +6486,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
else if (!rlen && !del) {
r = t; rlen = tlen; rend = tend;
}
+
if (!squash) {
if ((!rlen && !del) || t == r ||
(tlen == rlen && memEQ((char *)t, (char *)r, tlen)))
@@ -6444,6 +6495,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
}
+ /* extract char ranges from t and r and append them to listsv */
+
while (t < tend || tfirst <= tlast) {
/* see if we need more "t" chars */
if (tfirst > tlast) {
@@ -6516,6 +6569,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
tfirst += diff + 1;
}
+ /* compile listsv into a swash and attach to o */
+
none = ++max;
if (del)
del = ++max;
@@ -6557,12 +6612,36 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
goto warnins;
}
+ /* Non-utf8 case: set o->op_pv to point to a simple 256-entry lookup
+ * table. Entries with the value -1 indicate chars not to be
+ * translated, while -2 indicates a search char without a
+ * corresponding replacement char under /d.
+ *
+ * With /c, an extra length arg is stored at the end of the table to
+ * indicate the number of chars in the replacement string, plus any
+ * excess replacement chars not paired with search chars. The extra
+ * chars are needed for utf8 strings. For example,
+ * tr/\x00-\xfd/abcd/c is logically equivalent to
+ * tr/\xfe\xff\x{100}\x{101}.../abcdddd.../, so the c,d chars need to
+ * be kept even though they aren't paired with any chars in the table
+ * (which represents chars \x00-\xff). Even without excess chars, the
+ * last replacement char needs to be kept.
+ *
+ * The toker will have already expanded char ranges in t and r.
+ */
+
tbl = (short*)PerlMemShared_calloc(
+ /* one slot for 'extra len' count and one slot
+ * for possible storing of last replacement char */
(o->op_private & OPpTRANS_COMPLEMENT) &&
!(o->op_private & OPpTRANS_DELETE) ? 258 : 256,
sizeof(short));
cPVOPo->op_pv = (char*)tbl;
+
if (complement) {
+ /* in this branch, j is a count of 'consumed' (i.e. paired off
+ * with a search char) replacement chars (so j <= rlen always)
+ */
for (i = 0; i < (I32)tlen; i++)
tbl[t[i]] = -1;
for (i = 0, j = 0; i < 256; i++) {
@@ -6584,13 +6663,16 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
if (!del) {
if (!rlen) {
+ /* empty replacement list */
j = rlen;
if (!squash)
o->op_private |= OPpTRANS_IDENTICAL;
}
else if (j >= (I32)rlen)
+ /* no more replacement chars than search chars */
j = rlen - 1;
else {
+ /* more replacement chars than search chars */
tbl =
(short *)
PerlMemShared_realloc(tbl,
@@ -6598,6 +6680,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
cPVOPo->op_pv = (char*)tbl;
}
tbl[0x100] = (short)(rlen - j);
+ /* store any excess replacement chars at end of main table */
for (i=0; i < (I32)rlen - j; i++)
tbl[0x101+i] = r[j+i];
}
@@ -6631,6 +6714,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
}
+ /* both non-utf8 and utf8 code paths end up here */
+
warnins:
if(del && rlen == tlen) {
Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Useless use of /d modifier in transliteration operator");
@@ -6646,6 +6731,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
return o;
}
+
/*
=for apidoc Am|OP *|newPMOP|I32 type|I32 flags
diff --git a/regen/op_private b/regen/op_private
index eb53edf43a..49cb4bc035 100644
--- a/regen/op_private
+++ b/regen/op_private
@@ -513,14 +513,15 @@ addbits('sassign',
for (qw(trans transr)) {
addbits($_,
- 0 => qw(OPpTRANS_FROM_UTF <UTF),
- 1 => qw(OPpTRANS_TO_UTF >UTF),
+ 0 => qw(OPpTRANS_FROM_UTF <UTF), # search chars are utf8
+ 1 => qw(OPpTRANS_TO_UTF >UTF), # replacement chars are utf8
2 => qw(OPpTRANS_IDENTICAL IDENT), # right side is same as left
- 3 => qw(OPpTRANS_SQUASH SQUASH),
+ 3 => qw(OPpTRANS_SQUASH SQUASH), # /s
# 4 is used for OPpTARGET_MY
- 5 => qw(OPpTRANS_COMPLEMENT COMPL),
- 6 => qw(OPpTRANS_GROWS GROWS),
- 7 => qw(OPpTRANS_DELETE DEL),
+ 5 => qw(OPpTRANS_COMPLEMENT COMPL), # /c
+ 6 => qw(OPpTRANS_GROWS GROWS), # replacement chars longer than
+ # src chars
+ 7 => qw(OPpTRANS_DELETE DEL), # /d
);
}