diff options
author | David Mitchell <davem@iabyn.com> | 2017-12-26 16:40:14 +0000 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2018-01-19 11:24:54 +0000 |
commit | 334c6444f99a7a99b8470da2c82ad066563f162a (patch) | |
tree | 19ca628b47c918af4a2499622e1913e6261ef2d7 | |
parent | 840d136c2c4af2a91d93c448456a52a1dda730b2 (diff) | |
download | perl-334c6444f99a7a99b8470da2c82ad066563f162a.tar.gz |
tr/// functions: add some basic code comments
For the various C functions which implement the compile-time and
run-time aspects of OP_TRANS, add some basic code comments at the top of
each function explaining what its purpose is.
Also add lots of code comments to the body of S_pmtrans() (which compiles
a tr///).
Also comment what the OPpTRANS_ private flag bits mean.
No functional changes.
-rw-r--r-- | doop.c | 63 | ||||
-rw-r--r-- | op.c | 86 | ||||
-rw-r--r-- | regen/op_private | 13 |
3 files changed, 156 insertions, 6 deletions
@@ -27,6 +27,14 @@ #include <signal.h> #endif + +/* Helper function for do_trans(). + * Handles non-utf8 cases(*) not involving the /c, /d, /s flags, + * and where search and replacement charlists aren't identical. + * (*) i.e. where the search and replacement charlists are non-utf8. sv may + * or may not be utf8. + */ + STATIC I32 S_do_trans_simple(pTHX_ SV * const sv) { @@ -95,6 +103,17 @@ S_do_trans_simple(pTHX_ SV * const sv) return matches; } + +/* Helper function for do_trans(). + * Handles non-utf8 cases(*) where search and replacement charlists are + * identical: so the string isn't modified, and only a count of modifiable + * chars is needed. + * Note that it doesn't handle /d or /s, since these modify the string + * even if the replacement list is empty. + * (*) i.e. where the search and replacement charlists are non-utf8. sv may + * or may not be utf8. + */ + STATIC I32 S_do_trans_count(pTHX_ SV * const sv) { @@ -132,6 +151,14 @@ S_do_trans_count(pTHX_ SV * const sv) return matches; } + +/* Helper function for do_trans(). + * Handles non-utf8 cases(*) involving the /c, /d, /s flags, + * and where search and replacement charlists aren't identical. + * (*) i.e. where the search and replacement charlists are non-utf8. sv may + * or may not be utf8. + */ + STATIC I32 S_do_trans_complex(pTHX_ SV * const sv) { @@ -214,6 +241,7 @@ S_do_trans_complex(pTHX_ SV * const sv) d += len; } else { + /* use the implicit 0x100..0x7fffffff search range */ matches++; if (!del) { ch = (rlen == 0) ? (I32)comp : @@ -259,6 +287,7 @@ S_do_trans_complex(pTHX_ SV * const sv) d += len; } else { + /* use the implicit 0x100..0x7fffffff search range */ matches++; if (!del) { if (comp - 0x100 < rlen) @@ -295,6 +324,14 @@ S_do_trans_complex(pTHX_ SV * const sv) return matches; } + +/* Helper function for do_trans(). + * Handles utf8 cases(*) not involving the /c, /d, /s flags, + * and where search and replacement charlists aren't identical. + * (*) i.e. where the search or replacement charlists are utf8. sv may + * or may not be utf8. + */ + STATIC I32 S_do_trans_simple_utf8(pTHX_ SV * const sv) { @@ -393,6 +430,17 @@ S_do_trans_simple_utf8(pTHX_ SV * const sv) return matches; } + +/* Helper function for do_trans(). + * Handles utf8 cases(*) where search and replacement charlists are + * identical: so the string isn't modified, and only a count of modifiable + * chars is needed. + * Note that it doesn't handle /d or /s, since these modify the string + * even if the replacement charlist is empty. + * (*) i.e. where the search or replacement charlists are utf8. sv may + * or may not be utf8. + */ + STATIC I32 S_do_trans_count_utf8(pTHX_ SV * const sv) { @@ -436,6 +484,14 @@ S_do_trans_count_utf8(pTHX_ SV * const sv) return matches; } + +/* Helper function for do_trans(). + * Handles utf8 cases(*) involving the /c, /d, /s flags, + * and where search and replacement charlists aren't identical. + * (*) i.e. where the search or replacement charlists are utf8. sv may + * or may not be utf8. + */ + STATIC I32 S_do_trans_complex_utf8(pTHX_ SV * const sv) { @@ -597,6 +653,13 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv) return matches; } + +/* Execute a tr//. sv is the value to be translated, while PL_op + * should be an OP_TRANS or OP_TRANSR op, whose op_pv field contains a + * translation table or whose op_sv field contains a swash. + * Returns a count of number of characters translated + */ + I32 Perl_do_trans(pTHX_ SV *sv) { @@ -6294,6 +6294,10 @@ Perl_newBINOP(pTHX_ I32 type, I32 flags, OP *first, OP *last) return fold_constants(op_integerize(op_std_init((OP *)binop))); } +/* Helper function for S_pmtrans(): comparison function to sort an array + * of codepoint range pairs. Sorts by start point, or if equal, by end + * point */ + static int uvcompare(const void *a, const void *b) __attribute__nonnull__(1) __attribute__nonnull__(2) @@ -6311,6 +6315,22 @@ static int uvcompare(const void *a, const void *b) return 0; } +/* Given an OP_TRANS / OP_TRANSR op o, plus OP_CONST ops expr and repl + * containing the search and replacement strings, assemble into + * a translation table attached as o->op_pv. + * Free expr and repl. + * It expects the toker to have already set the + * OPpTRANS_COMPLEMENT + * OPpTRANS_SQUASH + * OPpTRANS_DELETE + * flags as appropriate; this function may add + * OPpTRANS_FROM_UTF + * OPpTRANS_TO_UTF + * OPpTRANS_IDENTICAL + * OPpTRANS_GROWS + * flags + */ + static OP * S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) { @@ -6342,6 +6362,14 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) o->op_private |= OPpTRANS_TO_UTF; if (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) { + + /* for utf8 translations, op_sv will be set to point to a swash + * containing codepoint ranges. This is done by first assembling + * a textual representation of the ranges in listsv then compiling + * it using swash_init(). For more details of the textual format, + * see L<perlunicode.pod/"User-Defined Character Properties"> . + */ + SV* const listsv = newSVpvs("# comment\n"); SV* transv = NULL; const U8* tend = t + tlen; @@ -6383,15 +6411,24 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) * odd. */ if (complement) { + /* utf8 and /c: + * replace t/tlen/tend with a version that has the ranges + * complemented + */ U8 tmpbuf[UTF8_MAXBYTES+1]; UV *cp; UV nextmin = 0; Newx(cp, 2*tlen, UV); i = 0; transv = newSVpvs(""); + + /* convert search string into array of (start,end) range + * codepoint pairs stored in cp[]. Most "ranges" will start + * and end at the same char */ while (t < tend) { cp[2*i] = utf8n_to_uvchr(t, tend-t, &ulen, flags); t += ulen; + /* the toker converts X-Y into (X, ILLEGAL_UTF8_BYTE, Y) */ if (t < tend && *t == ILLEGAL_UTF8_BYTE) { t++; cp[2*i+1] = utf8n_to_uvchr(t, tend-t, &ulen, flags); @@ -6402,7 +6439,19 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } i++; } + + /* sort the ranges */ qsort(cp, i, 2*sizeof(UV), uvcompare); + + /* Create a utf8 string containing the complement of the + * codepoint ranges. For example if cp[] contains [A,B], [C,D], + * then transv will contain the equivalent of: + * join '', map chr, 0, ILLEGAL_UTF8_BYTE, A - 1, + * B + 1, ILLEGAL_UTF8_BYTE, C - 1, + * D + 1, ILLEGAL_UTF8_BYTE, 0x7fffffff; + * A range of a single char skips the ILLEGAL_UTF8_BYTE and + * end cp. + */ for (j = 0; j < i; j++) { UV val = cp[2*j]; diff = val - nextmin; @@ -6420,6 +6469,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (val >= nextmin) nextmin = val + 1; } + t = uvchr_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); { @@ -6436,6 +6486,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) else if (!rlen && !del) { r = t; rlen = tlen; rend = tend; } + if (!squash) { if ((!rlen && !del) || t == r || (tlen == rlen && memEQ((char *)t, (char *)r, tlen))) @@ -6444,6 +6495,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } } + /* extract char ranges from t and r and append them to listsv */ + while (t < tend || tfirst <= tlast) { /* see if we need more "t" chars */ if (tfirst > tlast) { @@ -6516,6 +6569,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) tfirst += diff + 1; } + /* compile listsv into a swash and attach to o */ + none = ++max; if (del) del = ++max; @@ -6557,12 +6612,36 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) goto warnins; } + /* Non-utf8 case: set o->op_pv to point to a simple 256-entry lookup + * table. Entries with the value -1 indicate chars not to be + * translated, while -2 indicates a search char without a + * corresponding replacement char under /d. + * + * With /c, an extra length arg is stored at the end of the table to + * indicate the number of chars in the replacement string, plus any + * excess replacement chars not paired with search chars. The extra + * chars are needed for utf8 strings. For example, + * tr/\x00-\xfd/abcd/c is logically equivalent to + * tr/\xfe\xff\x{100}\x{101}.../abcdddd.../, so the c,d chars need to + * be kept even though they aren't paired with any chars in the table + * (which represents chars \x00-\xff). Even without excess chars, the + * last replacement char needs to be kept. + * + * The toker will have already expanded char ranges in t and r. + */ + tbl = (short*)PerlMemShared_calloc( + /* one slot for 'extra len' count and one slot + * for possible storing of last replacement char */ (o->op_private & OPpTRANS_COMPLEMENT) && !(o->op_private & OPpTRANS_DELETE) ? 258 : 256, sizeof(short)); cPVOPo->op_pv = (char*)tbl; + if (complement) { + /* in this branch, j is a count of 'consumed' (i.e. paired off + * with a search char) replacement chars (so j <= rlen always) + */ for (i = 0; i < (I32)tlen; i++) tbl[t[i]] = -1; for (i = 0, j = 0; i < 256; i++) { @@ -6584,13 +6663,16 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } if (!del) { if (!rlen) { + /* empty replacement list */ j = rlen; if (!squash) o->op_private |= OPpTRANS_IDENTICAL; } else if (j >= (I32)rlen) + /* no more replacement chars than search chars */ j = rlen - 1; else { + /* more replacement chars than search chars */ tbl = (short *) PerlMemShared_realloc(tbl, @@ -6598,6 +6680,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) cPVOPo->op_pv = (char*)tbl; } tbl[0x100] = (short)(rlen - j); + /* store any excess replacement chars at end of main table */ for (i=0; i < (I32)rlen - j; i++) tbl[0x101+i] = r[j+i]; } @@ -6631,6 +6714,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } } + /* both non-utf8 and utf8 code paths end up here */ + warnins: if(del && rlen == tlen) { Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Useless use of /d modifier in transliteration operator"); @@ -6646,6 +6731,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) return o; } + /* =for apidoc Am|OP *|newPMOP|I32 type|I32 flags diff --git a/regen/op_private b/regen/op_private index eb53edf43a..49cb4bc035 100644 --- a/regen/op_private +++ b/regen/op_private @@ -513,14 +513,15 @@ addbits('sassign', for (qw(trans transr)) { addbits($_, - 0 => qw(OPpTRANS_FROM_UTF <UTF), - 1 => qw(OPpTRANS_TO_UTF >UTF), + 0 => qw(OPpTRANS_FROM_UTF <UTF), # search chars are utf8 + 1 => qw(OPpTRANS_TO_UTF >UTF), # replacement chars are utf8 2 => qw(OPpTRANS_IDENTICAL IDENT), # right side is same as left - 3 => qw(OPpTRANS_SQUASH SQUASH), + 3 => qw(OPpTRANS_SQUASH SQUASH), # /s # 4 is used for OPpTARGET_MY - 5 => qw(OPpTRANS_COMPLEMENT COMPL), - 6 => qw(OPpTRANS_GROWS GROWS), - 7 => qw(OPpTRANS_DELETE DEL), + 5 => qw(OPpTRANS_COMPLEMENT COMPL), # /c + 6 => qw(OPpTRANS_GROWS GROWS), # replacement chars longer than + # src chars + 7 => qw(OPpTRANS_DELETE DEL), # /d ); } |