summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doop.c506
-rw-r--r--dump.c13
-rw-r--r--embed.fnc10
-rw-r--r--embed.h25
-rw-r--r--invlist_inline.h5
-rw-r--r--lib/B/Deparse.pm163
-rw-r--r--op.c1310
-rw-r--r--op.h2
-rw-r--r--proto.h119
-rw-r--r--toke.c22
10 files changed, 1341 insertions, 834 deletions
diff --git a/doop.c b/doop.c
index e0d63f13d1..3cb1354540 100644
--- a/doop.c
+++ b/doop.c
@@ -22,6 +22,7 @@
#include "EXTERN.h"
#define PERL_IN_DOOP_C
#include "perl.h"
+#include "invlist_inline.h"
#ifndef PERL_MICRO
#include <signal.h>
@@ -297,328 +298,240 @@ S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const tbl)
/* Helper function for do_trans().
- * Handles utf8 cases(*) not involving the /c, /d, /s flags,
- * and where search and replacement charlists aren't identical.
- * (*) i.e. where the search or replacement charlists are utf8. sv may
- * or may not be utf8.
+ * Handles cases where an inversion map implementation is to be used and the
+ * search and replacement charlists are identical: so the string isn't
+ * modified, and only a count of modifiable chars is needed.
+ *
+ * Note that it doesn't handle /d nor /s, since these modify the string
+ * even if the replacement charlist is empty.
+ *
+ * sv may or may not be utf8.
*/
STATIC Size_t
-S_do_trans_simple_utf8(pTHX_ SV * const sv)
+S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const invmap)
{
U8 *s;
U8 *send;
- U8 *d;
- U8 *start;
- U8 *dstart, *dend;
Size_t matches = 0;
- const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS);
STRLEN len;
- SV* const rv =
-#ifdef USE_ITHREADS
- PAD_SVl(cPADOP->op_padix);
-#else
- MUTABLE_SV(cSVOP->op_sv);
-#endif
- HV* const hv = MUTABLE_HV(SvRV(rv));
- SV* const * svp = hv_fetchs(hv, "NONE", FALSE);
- const UV none = svp ? SvUV(*svp) : 0x7fffffff;
- const UV extra = none + 1;
- UV final = 0;
- U8 hibit = 0;
+ SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE);
+ SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE);
+ SV* from_invlist = *from_invlist_ptr;
+ SV* to_invmap_sv = *to_invmap_ptr;
+ UV* map = (UV *) SvPVX(to_invmap_sv);
- PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8;
+ PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP;
s = (U8*)SvPV_nomg(sv, len);
- if (!SvUTF8(sv)) {
- hibit = ! is_utf8_invariant_string(s, len);
- if (hibit) {
- s = bytes_to_utf8(s, &len);
- }
- }
+
send = s + len;
- start = s;
- svp = hv_fetchs(hv, "FINAL", FALSE);
- if (svp)
- final = SvUV(*svp);
+ while (s < send) {
+ UV from;
+ SSize_t i;
+ STRLEN s_len;
+
+ /* Get the code point of the next character in the string */
+ if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) {
+ from = *s;
+ s_len = 1;
+ }
+ else {
+ from = utf8_to_uvchr_buf(s, send, &s_len);
+ if (from == 0 && *s != '\0') {
+ _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE);
+ }
+ }
- if (grows) {
- /* d needs to be bigger than s, in case e.g. upgrading is required */
- Newx(d, len * 3 + UTF8_MAXBYTES, U8);
- dend = d + len * 3;
- dstart = d;
- }
- else {
- dstart = d = s;
- dend = d + len;
- }
+ /* Look the code point up in the data structure for this tr/// to get
+ * what it maps to */
+ i = _invlist_search(from_invlist, from);
+ assert(i >= 0);
- while (s < send) {
- const UV uv = swash_fetch(rv, s, TRUE);
- if (uv < none) {
- s += UTF8SKIP(s);
- matches++;
- d = uvchr_to_utf8(d, uv);
- }
- else if (uv == none) {
- const int i = UTF8SKIP(s);
- Move(s, d, i, U8);
- d += i;
- s += i;
- }
- else if (uv == extra) {
- s += UTF8SKIP(s);
- matches++;
- d = uvchr_to_utf8(d, final);
- }
- else
- s += UTF8SKIP(s);
-
- if (d > dend) {
- const STRLEN clen = d - dstart;
- const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
- if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__);
- Renew(dstart, nlen + UTF8_MAXBYTES, U8);
- d = dstart + clen;
- dend = dstart + nlen;
- }
- }
- if (grows || hibit) {
- sv_setpvn(sv, (char*)dstart, d - dstart);
- Safefree(dstart);
- if (grows && hibit)
- Safefree(start);
- }
- else {
- *d = '\0';
- SvCUR_set(sv, d - dstart);
+ if (map[i] != (UV) TR_UNLISTED) {
+ matches++;
+ }
+
+ s += s_len;
}
- SvSETMAGIC(sv);
- SvUTF8_on(sv);
return matches;
}
/* Helper function for do_trans().
- * Handles utf8 cases(*) where search and replacement charlists are
- * identical: so the string isn't modified, and only a count of modifiable
- * chars is needed.
- * Note that it doesn't handle /d or /s, since these modify the string
- * even if the replacement charlist is empty.
- * (*) i.e. where the search or replacement charlists are utf8. sv may
- * or may not be utf8.
+ * Handles cases where an inversion map implementation is to be used and the
+ * search and replacement charlists are either not identical or flags are
+ * present.
+ *
+ * sv may or may not be utf8.
*/
STATIC Size_t
-S_do_trans_count_utf8(pTHX_ SV * const sv)
+S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap)
{
- const U8 *s;
- const U8 *start = NULL;
- const U8 *send;
+ U8 *s;
+ U8 *send;
+ U8 *d;
+ U8 *s0;
+ U8 *d0;
Size_t matches = 0;
STRLEN len;
- SV* const rv =
-#ifdef USE_ITHREADS
- PAD_SVl(cPADOP->op_padix);
-#else
- MUTABLE_SV(cSVOP->op_sv);
-#endif
- HV* const hv = MUTABLE_HV(SvRV(rv));
- SV* const * const svp = hv_fetchs(hv, "NONE", FALSE);
- const UV none = svp ? SvUV(*svp) : 0x7fffffff;
- const UV extra = none + 1;
- U8 hibit = 0;
-
- PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8;
+ SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE);
+ SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE);
+ SV** const to_expansion_ptr = av_fetch(invmap, 2, TRUE);
+ NV max_expansion = SvNV(*to_expansion_ptr);
+ SV* from_invlist = *from_invlist_ptr;
+ SV* to_invmap_sv = *to_invmap_ptr;
+ UV* map = (UV *) SvPVX(to_invmap_sv);
+ UV previous_map = TR_OOB;
+ const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH);
+ const bool delete_unfound = cBOOL(PL_op->op_private & OPpTRANS_DELETE);
+ bool inplace = ! cBOOL(PL_op->op_private & OPpTRANS_GROWS);
+ const UV* from_array = invlist_array(from_invlist);
+ UV final_map;
+ bool out_is_utf8 = SvUTF8(sv);
+ STRLEN s_len;
+
+ PERL_ARGS_ASSERT_DO_TRANS_INVMAP;
+
+ /* A third element in the array indicates that the replacement list was
+ * shorter than the search list, and this element contains the value to use
+ * for the items that don't correspond */
+ if (av_top_index(invmap) >= 3) {
+ SV** const final_map_ptr = av_fetch(invmap, 3, TRUE);
+ SV* const final_map_sv = *final_map_ptr;
+ final_map = SvUV(final_map_sv);
+ }
- s = (const U8*)SvPV_nomg_const(sv, len);
- if (!SvUTF8(sv)) {
- hibit = ! is_utf8_invariant_string(s, len);
- if (hibit) {
- start = s = bytes_to_utf8(s, &len);
- }
+ /* If there is something in the transliteration that could force the input
+ * to be changed to UTF-8, we don't know if we can do it in place, so
+ * assume cannot */
+ if (! out_is_utf8 && (PL_op->op_private & OPpTRANS_CAN_FORCE_UTF8)) {
+ inplace = FALSE;
+ if (max_expansion < 2) {
+ max_expansion = 2;
+ }
}
+
+ s = (U8*)SvPV_nomg(sv, len);
send = s + len;
+ s0 = s;
- while (s < send) {
- const UV uv = swash_fetch(rv, s, TRUE);
- if (uv < none || uv == extra)
- matches++;
- s += UTF8SKIP(s);
+ /* We know by now if there are some possible input strings whose
+ * transliterations are longer than the input. If none can, we just edit
+ * in place. */
+ if (inplace) {
+ d0 = d = s;
+ }
+ else {
+ /* Here, we can't edit in place. We have no idea how much, if any,
+ * this particular input string will grow. However, the compilation
+ * calculated the maximum expansion possible. Use that to allocale
+ * based on the worst case scenario. */
+ Newx(d, len * max_expansion + 1, U8);
+ d0 = d;
}
- if (hibit)
- Safefree(start);
- return matches;
-}
+ restart:
+ /* Do the actual transliteration */
+ while (s < send) {
+ UV from;
+ UV to;
+ SSize_t i;
+ STRLEN s_len;
+
+ /* Get the code point of the next character in the string */
+ if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) {
+ from = *s;
+ s_len = 1;
+ }
+ else {
+ from = utf8_to_uvchr_buf(s, send, &s_len);
+ if (from == 0 && *s != '\0') {
+ _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE);
+ }
+ }
-/* Helper function for do_trans().
- * Handles utf8 cases(*) involving the /c, /d, /s flags,
- * and where search and replacement charlists aren't identical.
- * (*) i.e. where the search or replacement charlists are utf8. sv may
- * or may not be utf8.
- */
+ /* Look the code point up in the data structure for this tr/// to get
+ * what it maps to */
+ i = _invlist_search(from_invlist, from);
+ assert(i >= 0);
-STATIC Size_t
-S_do_trans_complex_utf8(pTHX_ SV * const sv)
-{
- U8 *start, *send;
- U8 *d;
- Size_t matches = 0;
- const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH);
- const bool del = cBOOL(PL_op->op_private & OPpTRANS_DELETE);
- const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS);
- SV* const rv =
-#ifdef USE_ITHREADS
- PAD_SVl(cPADOP->op_padix);
-#else
- MUTABLE_SV(cSVOP->op_sv);
-#endif
- HV * const hv = MUTABLE_HV(SvRV(rv));
- SV * const *svp = hv_fetchs(hv, "NONE", FALSE);
- const UV none = svp ? SvUV(*svp) : 0x7fffffff;
- const UV extra = none + 1;
- UV final = 0;
- bool havefinal = FALSE;
- STRLEN len;
- U8 *dstart, *dend;
- U8 hibit = 0;
- U8 *s = (U8*)SvPV_nomg(sv, len);
+ to = map[i];
- PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8;
+ if (to == (UV) TR_UNLISTED) { /* Just copy the unreplaced character */
+ if (UVCHR_IS_INVARIANT(from) || ! out_is_utf8) {
+ *d++ = from;
+ }
+ else if (SvUTF8(sv)) {
+ Move(s, d, s_len, U8);
+ d += s_len;
+ }
+ else { /* Convert to UTF-8 */
+ append_utf8_from_native_byte(*s, &d);
+ }
- if (!SvUTF8(sv)) {
- hibit = ! is_utf8_invariant_string(s, len);
- if (hibit) {
- s = bytes_to_utf8(s, &len);
+ previous_map = to;
+ s += s_len;
+ continue;
}
- }
- send = s + len;
- start = s;
- svp = hv_fetchs(hv, "FINAL", FALSE);
- if (svp) {
- final = SvUV(*svp);
- havefinal = TRUE;
- }
+ /* Everything else is counted as a match */
+ matches++;
- if (grows) {
- /* d needs to be bigger than s, in case e.g. upgrading is required */
- Newx(d, len * 3 + UTF8_MAXBYTES, U8);
- dend = d + len * 3;
- dstart = d;
- }
- else {
- dstart = d = s;
- dend = d + len;
- }
+ if (to == (UV) TR_SPECIAL_HANDLING) {
+ if (delete_unfound) {
+ previous_map = to;
+ s += s_len;
+ continue;
+ }
- if (squash) {
- UV puv = 0xfeedface;
- while (s < send) {
- UV uv = swash_fetch(rv, s, TRUE);
-
- if (d > dend) {
- const STRLEN clen = d - dstart;
- const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
- if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
- Renew(dstart, nlen + UTF8_MAXBYTES, U8);
- d = dstart + clen;
- dend = dstart + nlen;
- }
- if (uv < none) {
- matches++;
- s += UTF8SKIP(s);
- if (uv != puv) {
- d = uvchr_to_utf8(d, uv);
- puv = uv;
- }
- continue;
- }
- else if (uv == none) { /* "none" is unmapped character */
- const int i = UTF8SKIP(s);
- Move(s, d, i, U8);
- d += i;
- s += i;
- puv = 0xfeedface;
- continue;
- }
- else if (uv == extra && !del) {
- matches++;
- if (havefinal) {
- s += UTF8SKIP(s);
- if (puv != final) {
- d = uvchr_to_utf8(d, final);
- puv = final;
- }
- }
- else {
- STRLEN len;
- uv = utf8n_to_uvchr(s, send - s, &len, UTF8_ALLOW_DEFAULT);
- if (uv != puv) {
- Move(s, d, len, U8);
- d += len;
- puv = uv;
- }
- s += len;
- }
- continue;
- }
- matches++; /* "none+1" is delete character */
- s += UTF8SKIP(s);
- }
- }
- else {
- while (s < send) {
- const UV uv = swash_fetch(rv, s, TRUE);
- if (d > dend) {
- const STRLEN clen = d - dstart;
- const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
- if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
- Renew(dstart, nlen + UTF8_MAXBYTES, U8);
- d = dstart + clen;
- dend = dstart + nlen;
- }
- if (uv < none) {
- matches++;
- s += UTF8SKIP(s);
- d = uvchr_to_utf8(d, uv);
- continue;
- }
- else if (uv == none) { /* "none" is unmapped character */
- const int i = UTF8SKIP(s);
- Move(s, d, i, U8);
- d += i;
- s += i;
- continue;
- }
- else if (uv == extra && !del) {
- matches++;
- s += UTF8SKIP(s);
- d = uvchr_to_utf8(d, final);
- continue;
- }
- matches++; /* "none+1" is delete character */
- s += UTF8SKIP(s);
- }
+ /* Use the final character in the replacement list */
+ to = final_map;
+ }
+ else { /* Here the input code point is to be remapped. The actual
+ value is offset from the base of this entry */
+ to += from - from_array[i];
+ }
+
+ /* If copying all occurrences, or this is the first occurrence, copy it
+ * to the output */
+ if (! squash || to != previous_map) {
+ if (out_is_utf8) {
+ d = uvchr_to_utf8(d, to);
+ }
+ else {
+ if (to >= 256) { /* If need to convert to UTF-8, restart */
+ out_is_utf8 = TRUE;
+ s = s0;
+ d = d0;
+ matches = 0;
+ goto restart;
+ }
+ *d++ = to;
+ }
+ }
+
+ previous_map = to;
+ s += s_len;
}
- if (grows || hibit) {
- sv_setpvn(sv, (char*)dstart, d - dstart);
- Safefree(dstart);
- if (grows && hibit)
- Safefree(start);
+
+ s_len = 0;
+ s += s_len;
+ if (! inplace) {
+ sv_setpvn(sv, (char*)d0, d - d0);
}
else {
*d = '\0';
- SvCUR_set(sv, d - dstart);
+ SvCUR_set(sv, d - d0);
+ }
+
+ if (! SvUTF8(sv) && out_is_utf8) {
+ SvUTF8_on(sv);
}
- SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
@@ -627,7 +540,8 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
/* Execute a tr//. sv is the value to be translated, while PL_op
* should be an OP_TRANS or OP_TRANSR op, whose op_pv field contains a
- * translation table or whose op_sv field contains a swash.
+ * translation table or whose op_sv field contains an inversion map.
+ *
* Returns a count of number of characters translated
*/
@@ -636,31 +550,49 @@ Perl_do_trans(pTHX_ SV *sv)
{
STRLEN len;
const U8 flags = PL_op->op_private;
- const U8 hasutf = flags & (OPpTRANS_FROM_UTF | OPpTRANS_TO_UTF);
+ bool use_utf8_fcns = cBOOL(flags & OPpTRANS_USE_SVOP);
+ bool identical = cBOOL(flags & OPpTRANS_IDENTICAL);
PERL_ARGS_ASSERT_DO_TRANS;
- if (SvREADONLY(sv) && !(flags & OPpTRANS_IDENTICAL)) {
+ if (SvREADONLY(sv) && ! identical) {
Perl_croak_no_modify();
}
(void)SvPV_const(sv, len);
if (!len)
return 0;
- if (!(flags & OPpTRANS_IDENTICAL)) {
+ if (! identical) {
if (!SvPOKp(sv) || SvTHINKFIRST(sv))
(void)SvPV_force_nomg(sv, len);
(void)SvPOK_only_UTF8(sv);
}
- /* If we use only OPpTRANS_IDENTICAL to bypass the READONLY check,
- * we must also rely on it to choose the readonly strategy.
- */
- if (flags & OPpTRANS_IDENTICAL) {
- return hasutf ? do_trans_count_utf8(sv) : do_trans_count(sv, (OPtrans_map*)cPVOP->op_pv);
- } else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) {
- return hasutf ? do_trans_complex_utf8(sv) : do_trans_complex(sv, (OPtrans_map*)cPVOP->op_pv);
- } else {
- return hasutf ? do_trans_simple_utf8(sv) : do_trans_simple(sv, (OPtrans_map*)cPVOP->op_pv);
+ if (use_utf8_fcns) {
+ SV* const map =
+#ifdef USE_ITHREADS
+ PAD_SVl(cPADOP->op_padix);
+#else
+ MUTABLE_SV(cSVOP->op_sv);
+#endif
+
+ if (identical) {
+ return do_trans_count_invmap(sv, (AV *) map);
+ }
+ else {
+ return do_trans_invmap(sv, (AV *) map);
+ }
+ }
+ else {
+ const OPtrans_map * const map = (OPtrans_map*)cPVOP->op_pv;
+
+ if (identical) {
+ return do_trans_count(sv, map);
+ }
+ else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) {
+ return do_trans_complex(sv, map);
+ }
+ else
+ return do_trans_simple(sv, map);
}
}
diff --git a/dump.c b/dump.c
index 78c151da16..f03c3f6153 100644
--- a/dump.c
+++ b/dump.c
@@ -1305,13 +1305,13 @@ S_do_op_dump_bar(pTHX_ I32 level, UV bar, PerlIO *file, const OP *o)
case OP_TRANS:
case OP_TRANSR:
- if (o->op_private & (OPpTRANS_FROM_UTF | OPpTRANS_TO_UTF)) {
- /* utf8: table stored as a swash */
+ if (o->op_private & OPpTRANS_USE_SVOP) {
+ /* utf8: table stored as an inversion map */
#ifndef USE_ITHREADS
- /* with ITHREADS, swash is stored in the pad, and the right pad
+ /* with ITHREADS, it is stored in the pad, and the right pad
* may not be active here, so skip */
S_opdump_indent(aTHX_ o, level, bar, file,
- "SWASH = 0x%" UVxf "\n",
+ "INVMAP = 0x%" UVxf "\n",
PTR2UV(MUTABLE_SV(cSVOPo->op_sv)));
#endif
}
@@ -2986,11 +2986,10 @@ Perl_op_class(pTHX_ const OP *o)
* pointer to a table of shorts used to look up translations.
* Under utf8, however, a simple table isn't practical; instead,
* the OP is an SVOP (or, under threads, a PADOP),
- * and the SV is a reference to a swash
- * (i.e., an RV pointing to an HV).
+ * and the SV is an AV.
*/
return (!custom &&
- (o->op_private & (OPpTRANS_TO_UTF|OPpTRANS_FROM_UTF))
+ (o->op_private & OPpTRANS_USE_SVOP)
)
#if defined(USE_ITHREADS)
? OPclass_PADOP : OPclass_PVOP;
diff --git a/embed.fnc b/embed.fnc
index 87c5159b8c..76ec0c0909 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1870,7 +1870,7 @@ Apd |void |sv_vsetpvfn |NN SV *const sv|NN const char *const pat|const STRLEN pa
ApR |NV |str_to_version |NN SV *sv
EXpR |SV* |swash_init |NN const char* pkg|NN const char* name|NN SV* listsv|I32 minbits|I32 none
EXp |UV |swash_fetch |NN SV *swash|NN const U8 *ptr|bool do_utf8
-#if defined(PERL_IN_REGCOMP_C)
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
EiR |SV* |add_cp_to_invlist |NULLOK SV* invlist|const UV cp
Ei |void |invlist_extend |NN SV* const invlist|const UV len
Ei |void |invlist_set_len|NN SV* const invlist|const UV len|const bool offset
@@ -1922,7 +1922,8 @@ EpX |SV* |invlist_clone |NN SV* const invlist|NULLOK SV* newlist
#endif
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) \
|| defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) \
- || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C)
+ || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) \
+ || defined(PERL_IN_DOOP_C)
EiRT |UV* |invlist_array |NN SV* const invlist
EiRT |bool |is_invlist |NULLOK SV* const invlist
EiRT |bool* |get_invlist_offset_addr|NN SV* invlist
@@ -2308,9 +2309,8 @@ p |void |init_constants
SR |Size_t |do_trans_simple |NN SV * const sv|NN const OPtrans_map * const tbl
SR |Size_t |do_trans_count |NN SV * const sv|NN const OPtrans_map * const tbl
SR |Size_t |do_trans_complex |NN SV * const sv|NN const OPtrans_map * const tbl
-SR |Size_t |do_trans_simple_utf8 |NN SV * const sv
-SR |Size_t |do_trans_count_utf8 |NN SV * const sv
-SR |Size_t |do_trans_complex_utf8 |NN SV * const sv
+SR |Size_t |do_trans_invmap |NN SV * const sv|NN AV * const map
+SR |Size_t |do_trans_count_invmap |NN SV * const sv|NN AV * const map
#endif
#if defined(PERL_IN_GV_C)
diff --git a/embed.h b/embed.h
index 425ba304da..5a1c6fe190 100644
--- a/embed.h
+++ b/embed.h
@@ -1014,7 +1014,6 @@
# endif
# if defined(PERL_IN_REGCOMP_C)
#define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c)
-#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b)
#define add_data S_add_data
#define add_multi_match(a,b,c) S_add_multi_match(aTHX_ a,b,c)
#define change_engine_size(a,b) S_change_engine_size(aTHX_ a,b)
@@ -1024,20 +1023,13 @@
#define edit_distance S_edit_distance
#define get_ANYOFM_contents(a) S_get_ANYOFM_contents(aTHX_ a)
#define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
-#define get_invlist_iter_addr S_get_invlist_iter_addr
#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
#define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d)
#define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e)
#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e)
#define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) Perl_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)
#define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b)
-#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b)
-#define invlist_highest S_invlist_highest
#define invlist_is_iterating S_invlist_is_iterating
-#define invlist_iterfinish S_invlist_iterfinish
-#define invlist_iterinit S_invlist_iterinit
-#define invlist_iternext S_invlist_iternext
-#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c)
#define is_ssc_worth_it S_is_ssc_worth_it
#define join_exact(a,b,c,d,e,f,g) S_join_exact(aTHX_ a,b,c,d,e,f,g)
#define make_exactf_invlist(a,b) S_make_exactf_invlist(aTHX_ a,b)
@@ -1083,6 +1075,16 @@
# if defined(PERL_IN_REGCOMP_C) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C)
#define _invlist_dump(a,b,c,d) Perl__invlist_dump(aTHX_ a,b,c,d)
# endif
+# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
+#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b)
+#define get_invlist_iter_addr S_get_invlist_iter_addr
+#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b)
+#define invlist_highest S_invlist_highest
+#define invlist_iterfinish S_invlist_iterfinish
+#define invlist_iterinit S_invlist_iterinit
+#define invlist_iternext S_invlist_iternext
+#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c)
+# endif
# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C)
#define _invlistEQ(a,b,c) Perl__invlistEQ(aTHX_ a,b,c)
#define _new_invlist_C_array(a) Perl__new_invlist_C_array(aTHX_ a)
@@ -1094,7 +1096,7 @@
#endif
#define regprop(a,b,c,d,e) Perl_regprop(aTHX_ a,b,c,d,e)
# endif
-# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C)
+# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
#define _invlist_contains_cp S__invlist_contains_cp
#define _invlist_len S__invlist_len
#define _invlist_search Perl__invlist_search
@@ -1603,11 +1605,10 @@
# endif
# if defined(PERL_IN_DOOP_C)
#define do_trans_complex(a,b) S_do_trans_complex(aTHX_ a,b)
-#define do_trans_complex_utf8(a) S_do_trans_complex_utf8(aTHX_ a)
#define do_trans_count(a,b) S_do_trans_count(aTHX_ a,b)
-#define do_trans_count_utf8(a) S_do_trans_count_utf8(aTHX_ a)
+#define do_trans_count_invmap(a,b) S_do_trans_count_invmap(aTHX_ a,b)
+#define do_trans_invmap(a,b) S_do_trans_invmap(aTHX_ a,b)
#define do_trans_simple(a,b) S_do_trans_simple(aTHX_ a,b)
-#define do_trans_simple_utf8(a) S_do_trans_simple_utf8(aTHX_ a)
# endif
# if defined(PERL_IN_DUMP_C)
#define deb_curcv(a) S_deb_curcv(aTHX_ a)
diff --git a/invlist_inline.h b/invlist_inline.h
index 76d6dda998..33f8aee385 100644
--- a/invlist_inline.h
+++ b/invlist_inline.h
@@ -14,7 +14,8 @@
|| defined(PERL_IN_REGEXEC_C) \
|| defined(PERL_IN_TOKE_C) \
|| defined(PERL_IN_PP_C) \
- || defined(PERL_IN_OP_C)
+ || defined(PERL_IN_OP_C) \
+ || defined(PERL_IN_DOOP_C)
/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
* etc */
@@ -92,7 +93,7 @@ S_invlist_array(SV* const invlist)
}
#endif
-#if defined(PERL_IN_REGCOMP_C)
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
PERL_STATIC_INLINE void
S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
diff --git a/lib/B/Deparse.pm b/lib/B/Deparse.pm
index 88555ffbe3..1ae4619d5d 100644
--- a/lib/B/Deparse.pm
+++ b/lib/B/Deparse.pm
@@ -279,6 +279,8 @@ BEGIN { for (qw[ const stringify rv2sv list glob pushmark null aelem
# _pessimise_walk(): recursively walk the optree of a sub,
# possibly undoing optimisations along the way.
+sub DEBUG { 0 }
+
sub _pessimise_walk {
my ($self, $startop) = @_;
@@ -5714,100 +5716,81 @@ sub tr_chr {
}
}
-# XXX This doesn't yet handle all cases correctly either
+sub tr_invmap {
+ my ($invlist_ref, $map_ref) = @_;
-sub tr_decode_utf8 {
- my($swash_hv, $flags) = @_;
- my %swash = $swash_hv->ARRAY;
- my $final = undef;
- $final = $swash{'FINAL'}->IV if exists $swash{'FINAL'};
- my $none = $swash{"NONE"}->IV;
- my $extra = $none + 1;
- my(@from, @delfrom, @to);
- my $line;
- foreach $line (split /\n/, $swash{'LIST'}->PV) {
- my($min, $max, $result) = split(/\t/, $line);
- $min = hex $min;
- if (length $max) {
- $max = hex $max;
- } else {
- $max = $min;
- }
- $result = hex $result;
- if ($result == $extra) {
- push @delfrom, [$min, $max];
- } else {
- push @from, [$min, $max];
- push @to, [$result, $result + $max - $min];
- }
- }
- for my $i (0 .. $#from) {
- if ($from[$i][0] == ord '-') {
- unshift @from, splice(@from, $i, 1);
- unshift @to, splice(@to, $i, 1);
- last;
- } elsif ($from[$i][1] == ord '-') {
- $from[$i][1]--;
- $to[$i][1]--;
- unshift @from, ord '-';
- unshift @to, ord '-';
- last;
- }
- }
- for my $i (0 .. $#delfrom) {
- if ($delfrom[$i][0] == ord '-') {
- push @delfrom, splice(@delfrom, $i, 1);
- last;
- } elsif ($delfrom[$i][1] == ord '-') {
- $delfrom[$i][1]--;
- push @delfrom, ord '-';
- last;
- }
+ my $infinity = ~0 >> 1; # IV_MAX
+ my $from = "";
+ my $to = "";
+
+ for my $i (0.. @$invlist_ref - 1) {
+ my $this_from = $invlist_ref->[$i];
+ my $map = $map_ref->[$i];
+ my $upper = ($i < @$invlist_ref - 1)
+ ? $invlist_ref->[$i+1]
+ : $infinity;
+ my $range = $upper - $this_from - 1;
+ if (DEBUG) {
+ print STDERR "i=$i, from=$this_from, upper=$upper, range=$range\n";
+ }
+ next if $map == ~0;
+ next if $map == ~0 - 1;
+ $from .= tr_chr($this_from);
+ $to .= tr_chr($map);
+ next if $range == 0; # Single code point
+ if ($range == 1) { # Adjacent code points
+ $from .= tr_chr($this_from + 1);
+ $to .= tr_chr($map + 1);
+ }
+ elsif ($upper != $infinity) {
+ $from .= "-" . tr_chr($this_from + $range);
+ $to .= "-" . tr_chr($map + $range);
+ }
+ else {
+ $from .= "-INFTY";
+ $to .= "-INFTY";
+ }
}
- if (defined $final and $to[$#to][1] != $final) {
- push @to, [$final, $final];
+
+ return ($from, $to);
+}
+
+sub tr_decode_utf8 {
+ my($tr_av, $flags) = @_;
+ printf STDERR "flags=0x%x\n", $flags if DEBUG;
+ my $invlist = $tr_av->ARRAYelt(0);
+ my @invlist = unpack("J*", $invlist->PV);
+ my @map = unpack("J*", $tr_av->ARRAYelt(1)->PV);
+
+ if (DEBUG) {
+ for my $i (0 .. @invlist - 1) {
+ printf STDERR "[%d]\t%x\t", $i, $invlist[$i];
+ my $map = $map[$i];
+ if ($map == ~0) {
+ print STDERR "TR_UNMAPPED\n";
+ }
+ elsif ($map == ~0 - 1) {
+ print STDERR "TR_SPECIAL\n";
+ }
+ else {
+ printf STDERR "%x\n", $map;
+ }
+ }
}
- push @from, @delfrom;
+
+ my ($from, $to) = tr_invmap(\@invlist, \@map);
+
if ($flags & OPpTRANS_COMPLEMENT) {
- my @newfrom;
- my $next = 0;
- for my $i (0 .. $#from) {
- push @newfrom, [$next, $from[$i][0] - 1];
- $next = $from[$i][1] + 1;
- }
- @from = ();
- for my $range (@newfrom) {
- if ($range->[0] <= $range->[1]) {
- push @from, $range;
- }
- }
- }
- my($from, $to, $diff);
- for my $chunk (@from) {
- $diff = $chunk->[1] - $chunk->[0];
- if ($diff > 1) {
- $from .= tr_chr($chunk->[0]) . "-" . tr_chr($chunk->[1]);
- } elsif ($diff == 1) {
- $from .= tr_chr($chunk->[0]) . tr_chr($chunk->[1]);
- } else {
- $from .= tr_chr($chunk->[0]);
- }
+ shift @map;
+ pop @invlist;
+ my $throw_away;
+ ($from, $throw_away) = tr_invmap(\@invlist, \@map);
}
- for my $chunk (@to) {
- $diff = $chunk->[1] - $chunk->[0];
- if ($diff > 1) {
- $to .= tr_chr($chunk->[0]) . "-" . tr_chr($chunk->[1]);
- } elsif ($diff == 1) {
- $to .= tr_chr($chunk->[0]) . tr_chr($chunk->[1]);
- } else {
- $to .= tr_chr($chunk->[0]);
- }
+
+ if (DEBUG) {
+ print STDERR "Returning ", escape_str($from), "/",
+ escape_str($to), "\n";
}
- #$final = sprintf("%04x", $final) if defined $final;
- #$none = sprintf("%04x", $none) if defined $none;
- #$extra = sprintf("%04x", $extra) if defined $extra;
- #print STDERR "final: $final\n none: $none\nextra: $extra\n";
- #print STDERR $swash{'LIST'}->PV;
return (escape_str($from), escape_str($to));
}
@@ -5821,9 +5804,9 @@ sub pp_trans {
($from, $to) = tr_decode_byte($op->pv, $priv_flags);
} elsif ($class eq "PADOP") {
($from, $to)
- = tr_decode_utf8($self->padval($op->padix)->RV, $priv_flags);
+ = tr_decode_utf8($self->padval($op->padix), $priv_flags);
} else { # class($op) eq "SVOP"
- ($from, $to) = tr_decode_utf8($op->sv->RV, $priv_flags);
+ ($from, $to) = tr_decode_utf8($op->sv, $priv_flags);
}
my $flags = "";
$flags .= "c" if $priv_flags & OPpTRANS_COMPLEMENT;
diff --git a/op.c b/op.c
index 12ee52a453..91eb50aee9 100644
--- a/op.c
+++ b/op.c
@@ -1059,7 +1059,7 @@ Perl_op_clear(pTHX_ OP *o)
case OP_TRANS:
case OP_TRANSR:
if ( (o->op_type == OP_TRANS || o->op_type == OP_TRANSR)
- && (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)))
+ && (o->op_private & OPpTRANS_USE_SVOP))
{
#ifdef USE_ITHREADS
if (cPADOPo->op_padix > 0) {
@@ -6784,8 +6784,8 @@ static int uvcompare(const void *a, const void *b)
* OPpTRANS_SQUASH
* OPpTRANS_DELETE
* flags as appropriate; this function may add
- * OPpTRANS_FROM_UTF
- * OPpTRANS_TO_UTF
+ * OPpTRANS_USE_SVOP
+ * OPpTRANS_CAN_FORCE_UTF8
* OPpTRANS_IDENTICAL
* OPpTRANS_GROWS
* flags
@@ -6794,416 +6794,1028 @@ static int uvcompare(const void *a, const void *b)
static OP *
S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
{
+ /* This function compiles a tr///, from data gathered from toke.c, into a
+ * form suitable for use by do_trans() in doop.c at runtime.
+ *
+ * It first normalizes the data, while discarding extraneous inputs; then
+ * writes out the compiled data. The normalization allows for complete
+ * analysis, and avoids some false negatives and positives earlier versions
+ * of this code had.
+ *
+ * The normalization form is an inversion map (described below in detail).
+ * This is essentially the compiled form for tr///'s that require UTF-8,
+ * and its easy to use it to write the 257-byte table for tr///'s that
+ * don't need UTF-8. That table is identical to what's been in use for
+ * many perl versions, except that it doesn't handle some edge cases that
+ * it used to, involving code points above 255. The UTF-8 form now handles
+ * these. (This could be changed with extra coding should it shown to be
+ * desirable.)
+ *
+ * If the complement (/c) option is specified, the lhs string (tstr) is
+ * parsed into an inversion list. Complementing these is trivial. Then a
+ * complemented tstr is built from that, and used thenceforth. This hides
+ * the fact that it was complemented from almost all successive code.
+ *
+ * One of the important characteristics to know about the input is whether
+ * the transliteration may be done in place, or does a temporary need to be
+ * allocated, then copied. If the replacement for every character in every
+ * possible string takes up no more bytes than the the character it
+ * replaces, then it can be edited in place. Otherwise the replacement
+ * could "grow", depending on the strings being processed. Some inputs
+ * won't grow, and might even shrink under /d, but some inputs could grow,
+ * so we have to assume any given one might grow. On very long inputs, the
+ * temporary could eat up a lot of memory, so we want to avoid it if
+ * possible. For non-UTF-8 inputs, everything is single-byte, so can be
+ * edited in place, unless there is something in the pattern that could
+ * force it into UTF-8. The inversion map makes it feasible to determine
+ * this. Previous versions of this code pretty much punted on determining
+ * if UTF-8 could be edited in place. Now, this code is rigorous in making
+ * that determination.
+ *
+ * Another characteristic we need to know is whether the lhs and rhs are
+ * identical. If so, and no other flags are present, the only effect of
+ * the tr/// is to count the characters present in the input that are
+ * mentioned in the lhs string. The implementation of that is easier and
+ * runs faster than the more general case. Normalizing here allows for
+ * accurate determination of this. Previously there were false negatives
+ * possible.
+ *
+ * Instead of 'transliterated', the comments here use 'unmapped' for the
+ * characters that are left unchanged by the operation; otherwise they are
+ * 'mapped'
+ *
+ * The lhs of the tr/// is here referred to as the t side.
+ * The rhs of the tr/// is here referred to as the r side.
+ */
+
SV * const tstr = ((SVOP*)expr)->op_sv;
SV * const rstr = ((SVOP*)repl)->op_sv;
STRLEN tlen;
STRLEN rlen;
- const U8 *t = (U8*)SvPV_const(tstr, tlen);
- const U8 *r = (U8*)SvPV_const(rstr, rlen);
- Size_t i, j;
- bool grows = FALSE;
- OPtrans_map *tbl;
- SSize_t struct_size; /* malloced size of table struct */
-
+ const U8 * t0 = (U8*)SvPV_const(tstr, tlen);
+ const U8 * r0 = (U8*)SvPV_const(rstr, rlen);
+ const U8 * t = t0;
+ const U8 * r = r0;
+ Size_t t_count = 0, r_count = 0; /* Number of characters in search and
+ replacement lists */
+
+ /* khw thinks some of the private flags for this op are quaintly named.
+ * OPpTRANS_GROWS for example is TRUE if the replacement for some lhs
+ * character when represented in UTF-8 is longer than the original
+ * character's UTF-8 representation */
const bool complement = cBOOL(o->op_private & OPpTRANS_COMPLEMENT);
const bool squash = cBOOL(o->op_private & OPpTRANS_SQUASH);
const bool del = cBOOL(o->op_private & OPpTRANS_DELETE);
- SV* swash;
+
+ /* Set to true if there is some character < 256 in the lhs that maps to >
+ * 255. If so, a non-UTF-8 match string can be forced into requiring to be
+ * in UTF-8 by a tr/// operation. */
+ bool can_force_utf8 = FALSE;
+
+ /* What is the maximum expansion factor in UTF-8 transliterations. If a
+ * 2-byte UTF-8 encoded character is to be replaced by a 3-byte one, its
+ * expansion factor is 1.5. This number is used at runtime to calculate
+ * how much space to allocate for non-inplace transliterations. Without
+ * this number, the worst case is 14, which is extremely unlikely to happen
+ * in real life, and would require significant memory overhead. */
+ NV max_expansion = 1.;
+
+ SSize_t t_range_count, r_range_count, min_range_count;
+ UV* t_array;
+ SV* t_invlist;
+ UV* r_map;
+ UV r_cp, t_cp;
+ IV t_cp_end = -1;
+ UV r_cp_end;
+ Size_t len;
+ AV* invmap;
+ UV final_map = TR_UNLISTED; /* The final character in the replacement
+ list, updated as we go along. Initialize
+ to something illegal */
+
+ bool rstr_utf8 = cBOOL(SvUTF8(rstr));
+ bool tstr_utf8 = cBOOL(SvUTF8(tstr));
+
+ const U8* tend = t + tlen;
+ const U8* rend = r + rlen;
+
+ SV * inverted_tstr = NULL;
+
+ Size_t i;
+ unsigned int pass2;
+
+ /* This routine implements detection of a transliteration having a longer
+ * UTF-8 representation than its source, by partitioning all the possible
+ * code points of the platform into equivalence classes of the same UTF-8
+ * byte length in the first pass. As it constructs the mappings, it carves
+ * these up into smaller chunks, but doesn't merge any together. This
+ * makes it easy to find the instances it's looking for. A second pass is
+ * done after this has been determined which merges things together to
+ * shrink the table for runtime. For ASCII platforms, the table is
+ * trivial, given below, and uses the fundamental characteristics of UTF-8
+ * to construct the values. For EBCDIC, it isn't so, and we rely on a
+ * table constructed by the perl script that generates these kinds of
+ * things */
+#ifndef EBCDIC
+ UV PL_partition_by_byte_length[] = {
+ 0,
+ 0x80,
+ (32 * (1UL << ( UTF_ACCUMULATION_SHIFT))),
+ (32 * (1UL << ( UTF_ACCUMULATION_SHIFT))),
+ (16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))),
+ ( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))),
+ ( 4 * (1UL << (4 * UTF_ACCUMULATION_SHIFT))),
+ ( 2 * (1UL << (5 * UTF_ACCUMULATION_SHIFT)))
+
+# ifdef UV_IS_QUAD
+ ,
+ ( (1UL << (6 * UTF_ACCUMULATION_SHIFT)))
+
+# endif
+#endif
+ };
PERL_ARGS_ASSERT_PMTRANS;
PL_hints |= HINT_BLOCK_SCOPE;
- if (SvUTF8(tstr))
- o->op_private |= OPpTRANS_FROM_UTF;
+ /* If /c, the search list is sorted and complemented. This is now done by
+ * creating an inversion list from it, and then trivially inverting that.
+ * The previous implementation used qsort, but creating the list
+ * automatically keeps it sorted as we go along */
+ if (complement) {
+ UV start, end;
+ SV * inverted_tlist = _new_invlist(tlen);
+ Size_t temp_len;
+
+ while (t < tend) {
+
+ /* Non-utf8 strings don't have ranges, so each character is listed
+ * out */
+ if (! tstr_utf8) {
+ inverted_tlist = add_cp_to_invlist(inverted_tlist, *t);
+ t++;
+ }
+ else { /* But UTF-8 strings have been parsed in toke.c to have
+ * ranges if appropriate. */
+ UV t_cp;
+ Size_t t_char_len;
+
+ /* Get the first character */
+ t_cp = valid_utf8_to_uvchr(t, &t_char_len);
+ t += t_char_len;
+
+ /* If the next byte indicates that this wasn't the first
+ * element of a range, the range is just this one */
+ if (t >= tend || *t != RANGE_INDICATOR) {
+ inverted_tlist = add_cp_to_invlist(inverted_tlist, t_cp);
+ }
+ else { /* Otherwise, ignore the indicator byte, and get the
+ final element, and add the whole range */
+ t++;
+ t_cp_end = valid_utf8_to_uvchr(t, &t_char_len);
+ t += t_char_len;
+
+ inverted_tlist = _add_range_to_invlist(inverted_tlist,
+ t_cp, t_cp_end);
+ }
+ }
+ } /* End of parse through tstr */
+
+ /* The inversion list is done; now invert it */
+ _invlist_invert(inverted_tlist);
+
+ /* Now go through the inverted list and create a new tstr for the rest
+ * of the routine to use. Since the UTF-8 version can have ranges, and
+ * can be much more compact than the non-UTF-8 version, we create the
+ * string in UTF-8 even if not necessary. (This is just an intermediate
+ * value that gets thrown away anyway.) */
+ invlist_iterinit(inverted_tlist);
+ inverted_tstr = newSVpvs("");
+ while (invlist_iternext(inverted_tlist, &start, &end)) {
+ U8 temp[UTF8_MAXBYTES];
+ U8 * temp_end_pos;
+
+ /* IV_MAX keeps things from going out of bounds */
+ start = MIN(IV_MAX, start);
+ end = MIN(IV_MAX, end);
+
+ temp_end_pos = uvchr_to_utf8(temp, start);
+ sv_catpvn(inverted_tstr, (char *) temp, temp_end_pos - temp);
+
+ if (start != end) {
+ Perl_sv_catpvf(aTHX_ inverted_tstr, "%c", RANGE_INDICATOR);
+ temp_end_pos = uvchr_to_utf8(temp, end);
+ sv_catpvn(inverted_tstr, (char *) temp, temp_end_pos - temp);
+ }
+ }
+
+ /* Set up so the remainder of the routine uses this complement, instead
+ * of the actual input */
+ t0 = t = (U8*)SvPV_const(inverted_tstr, temp_len);
+ tend = t0 + temp_len;
+ tstr_utf8 = TRUE;
+
+ SvREFCNT_dec_NN(inverted_tlist);
+ }
+
+ /* For non-/d, an empty rhs means to use the lhs */
+ if (rlen == 0 && ! del) {
+ r0 = t0;
+ rend = tend;
+ rstr_utf8 = tstr_utf8;
+ }
+
+ t_invlist = _new_invlist(1);
+
+ /* Parse the (potentially adjusted) input, creating the inversion map.
+ * This is done in two passes. The first pass is to determine if the
+ * transliteration can be done in place. The inversion map it creates
+ * could be used, but generally would be larger and slower to run than the
+ * output of the second pass, which starts with a more compact table and
+ * allows more ranges to be merged */
+ for (pass2 = 0; pass2 < 2; pass2++) {
- if (SvUTF8(rstr))
- o->op_private |= OPpTRANS_TO_UTF;
+ /* Initialize to a single range */
+ t_invlist = _add_range_to_invlist(t_invlist, 0, UV_MAX);
- if (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) {
+ /* In the second pass, we just have the single range */
- /* for utf8 translations, op_sv will be set to point to a swash
- * containing codepoint ranges. This is done by first assembling
- * a textual representation of the ranges in listsv then compiling
- * it using swash_init(). For more details of the textual format,
- * see L<perlunicode.pod/"User-Defined Character Properties"> .
+ if (pass2) {
+ len = 1;
+ t_array = invlist_array(t_invlist);
+ }
+ else {
+
+ /* But in the first pass, the lhs is partitioned such that the
+ * number of UTF-8 bytes required to represent a code point in each
+ * partition is the same as the number for any other code point in
+ * that partion. We copy the pre-compiled partion. */
+ len = C_ARRAY_LENGTH(PL_partition_by_byte_length);
+ invlist_extend(t_invlist, len);
+ t_array = invlist_array(t_invlist);
+ Copy(PL_partition_by_byte_length, t_array, len, UV);
+ invlist_set_len(t_invlist,
+ len,
+ *(get_invlist_offset_addr(t_invlist)));
+ Newx(r_map, len + 1, UV);
+ }
+
+ /* And the mapping of each of the ranges is initialized. Initially,
+ * everything is TR_UNLISTED. */
+ for (i = 0; i < len; i++) {
+ r_map[i] = TR_UNLISTED;
+ }
+
+ t = t0;
+ t_count = 0;
+ r = r0;
+ r_count = 0;
+ t_range_count = r_range_count = 0;
+
+ /* Now go through the search list constructing an inversion map. The
+ * input is not necessarily in any particular order. Making it an
+ * inversion map orders it, potentially simplifying, and makes it easy
+ * to deal with at run time. This is the only place in core that
+ * generates an inversion map; if others were introduced, it might be
+ * better to create general purpose routines to handle them.
+ * (Inversion maps are created in perl in other places.)
+ *
+ * An inversion map consists of two parallel arrays. One is
+ * essentially an inversion list: an ordered list of code points such
+ * that each element gives the first code point of a range of
+ * consecutive code points that map to the element in the other array
+ * that has the same index as this one (in other words, the
+ * corresponding element). Thus the range extends up to (but not
+ * including) the code point given by the next higher element. In a
+ * true inversion map, the corresponding element in the other array
+ * gives the mapping of the first code point in the range, with the
+ * understanding that the next higher code point in the inversion
+ * list's range will map to the next higher code point in the map.
+ *
+ * So if at element [i], let's say we have:
+ *
+ * t_invlist r_map
+ * [i] A a
+ *
+ * This means that A => a, B => b, C => c.... Let's say that the
+ * situation is such that:
+ *
+ * [i+1] L -1
+ *
+ * This means the sequence that started at [i] stops at K => k. This
+ * illustrates that you need to look at the next element to find where
+ * a sequence stops. Except, the highest element in the inversion list
+ * begins a range that is understood to extend to the platform's
+ * infinity.
+ *
+ * This routine modifies traditional inversion maps to reserve two
+ * mappings:
+ *
+ * TR_UNLISTED (or -1) indicates that the no code point in the range
+ * is listed in the tr/// searchlist. At runtime, these are
+ * always passed through unchanged. In the inversion map, all
+ * points in the range are mapped to -1, instead of increasing,
+ * like the 'L' in the example above.
+ *
+ * We start the parse with every code point mapped to this, and as
+ * we parse and find ones that are listed in the search list, we
+ * carve out ranges as we go along that override that.
+ *
+ * TR_SPECIAL_HANDLING (or -2) indicates that every code point in the
+ * range needs special handling. Again, all code points in the
+ * range are mapped to -2, instead of increasing.
+ *
+ * Under /d this value means the code point should be deleted from
+ * the transliteration when encountered.
+ *
+ * Otherwise, it marks that every code point in the range is to
+ * map to the final character in the replacement list. This
+ * happens only when the replacement list is shorter than the
+ * search one, so there are things in the search list that have no
+ * correspondence in the replacement list. For example, in
+ * tr/a-z/A/, 'A' is the final value, and the inversion map
+ * generated for this would be like this:
+ * \0 => -1
+ * a => A
+ * b-z => -2
+ * z+1 => -1
+ * 'A' appears once, then the remainder of the range maps to -2.
+ * The use of -2 isn't strictly necessary, as an inversion map is
+ * capable of representing this situation, but not nearly so
+ * compactly, and this is actually quite commonly encountered.
+ * Indeed, the original design of this code used a full inversion
+ * map for this. But things like
+ * tr/\0-\x{FFFF}/A/
+ * generated huge data structures, slowly, and the execution was
+ * also slow. So the current scheme was implemented.
+ *
+ * So, if the next element in our example is:
+ *
+ * [i+2] Q q
+ *
+ * Then all of L, M, N, O, and P map to TR_UNLISTED. If the next
+ * elements are
+ *
+ * [i+3] R z
+ * [i+4] S TR_UNLISTED
+ *
+ * Then Q => q; R => z; and S => TR_UNLISTED. If [i+4] (the 'S') is
+ * the final element in the arrays, every code point from S to infinity
+ * maps to TR_UNLISTED.
+ *
*/
+ /* Finish up range started in what otherwise would
+ * have been the final iteration */
+ while (t < tend || t_range_count > 0) {
+ bool adjacent_to_range_above = FALSE;
+ bool adjacent_to_range_below = FALSE;
+
+ bool merge_with_range_above = FALSE;
+ bool merge_with_range_below = FALSE;
+
+ SSize_t i, span, invmap_range_length_remaining;
+
+ /* If we are in the middle of processing a range in the 'target'
+ * side, the previous iteration has set us up. Otherwise, look at
+ * the next character in the search list */
+ if (t_range_count <= 0) {
+ if (! tstr_utf8) {
+
+ /* Here, not in the middle of a range, and not UTF-8. The
+ * next code point is the single byte where we're at */
+ t_cp = *t;
+ t_range_count = 1;
+ t++;
+ }
+ else {
+ Size_t t_char_len;
+
+ /* Here, not in the middle of a range, and is UTF-8. The
+ * next code point is the next UTF-8 char in the input. We
+ * know the input is valid, because the toker constructed
+ * it */
+ t_cp = valid_utf8_to_uvchr(t, &t_char_len);
+ t += t_char_len;
+
+ /* UTF-8 strings (only) have been parsed in toke.c to have
+ * ranges. See if the next byte indicates that this was
+ * the first element of a range. If so, get the final
+ * element and calculate the range size. If not, the range
+ * size is 1 */
+ if (t < tend && *t == RANGE_INDICATOR) {
+ t++;
+ t_range_count = valid_utf8_to_uvchr(t, &t_char_len)
+ - t_cp + 1;
+ t += t_char_len;
+ }
+ else {
+ t_range_count = 1;
+ }
+ }
- SV* const listsv = newSVpvs("# comment\n");
- SV* transv = NULL;
- const U8* tend = t + tlen;
- const U8* rend = r + rlen;
- STRLEN ulen;
- UV tfirst = 1;
- UV tlast = 0;
- IV tdiff;
- STRLEN tcount = 0;
- UV rfirst = 1;
- UV rlast = 0;
- IV rdiff;
- STRLEN rcount = 0;
- IV diff;
- I32 none = 0;
- U32 max = 0;
- I32 bits;
- I32 havefinal = 0;
- U32 final = 0;
- const I32 from_utf = o->op_private & OPpTRANS_FROM_UTF;
- const I32 to_utf = o->op_private & OPpTRANS_TO_UTF;
- U8* tsave = NULL;
- U8* rsave = NULL;
- const U32 flags = UTF8_ALLOW_DEFAULT;
-
- if (!from_utf) {
- STRLEN len = tlen;
- t = tsave = bytes_to_utf8(t, &len);
- tend = t + len;
- }
- if (!to_utf && rlen) {
- STRLEN len = rlen;
- r = rsave = bytes_to_utf8(r, &len);
- rend = r + len;
- }
+ /* Count the total number of listed code points * */
+ t_count += t_range_count;
+ }
-/* There is a snag with this code on EBCDIC: scan_const() in toke.c has
- * encoded chars in native encoding which makes ranges in the EBCDIC 0..255
- * odd. */
+ /* Similarly, get the next character in the replacement list */
+ if (r_range_count <= 0) {
+ if (r >= rend) {
- if (complement) {
- /* utf8 and /c:
- * replace t/tlen/tend with a version that has the ranges
- * complemented
- */
- U8 tmpbuf[UTF8_MAXBYTES+1];
- UV *cp;
- UV nextmin = 0;
- Newx(cp, 2*tlen, UV);
- i = 0;
- transv = newSVpvs("");
-
- /* convert search string into array of (start,end) range
- * codepoint pairs stored in cp[]. Most "ranges" will start
- * and end at the same char */
- while (t < tend) {
- cp[2*i] = utf8n_to_uvchr(t, tend-t, &ulen, flags);
- t += ulen;
- /* the toker converts X-Y into (X, RANGE_INDICATOR, Y) */
- if (t < tend && *t == RANGE_INDICATOR) {
- t++;
- cp[2*i+1] = utf8n_to_uvchr(t, tend-t, &ulen, flags);
- t += ulen;
- }
- else {
- cp[2*i+1] = cp[2*i];
- }
- i++;
- }
+ /* But if we've exhausted the rhs, there is nothing to map
+ * to, except the special handling one, and we make the
+ * range the same size as the lhs one. */
+ r_cp = TR_SPECIAL_HANDLING;
+ r_range_count = t_range_count;
+ }
+ else {
+ if (! rstr_utf8) {
+ r_cp = *r;
+ r_range_count = 1;
+ r++;
+ }
+ else {
+ Size_t r_char_len;
+
+ r_cp = valid_utf8_to_uvchr(r, &r_char_len);
+ r += r_char_len;
+ if (r < rend && *r == RANGE_INDICATOR) {
+ r++;
+ r_range_count = valid_utf8_to_uvchr(r,
+ &r_char_len) - r_cp + 1;
+ r += r_char_len;
+ }
+ else {
+ r_range_count = 1;
+ }
+ }
- /* sort the ranges */
- qsort(cp, i, 2*sizeof(UV), uvcompare);
-
- /* Create a utf8 string containing the complement of the
- * codepoint ranges. For example if cp[] contains [A,B], [C,D],
- * then transv will contain the equivalent of:
- * join '', map chr, 0, RANGE_INDICATOR, A - 1,
- * B + 1, RANGE_INDICATOR, C - 1,
- * D + 1, RANGE_INDICATOR, 0x7fffffff;
- * A range of a single char skips the RANGE_INDICATOR and
- * end cp.
- */
- for (j = 0; j < i; j++) {
- UV val = cp[2*j];
- diff = val - nextmin;
- if (diff > 0) {
- t = uvchr_to_utf8(tmpbuf,nextmin);
- sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
- if (diff > 1) {
- U8 range_mark = RANGE_INDICATOR;
- t = uvchr_to_utf8(tmpbuf, val - 1);
- sv_catpvn(transv, (char *)&range_mark, 1);
- sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
- }
- }
- val = cp[2*j+1];
- if (val >= nextmin)
- nextmin = val + 1;
- }
+ if (r_cp == TR_SPECIAL_HANDLING) {
+ r_range_count = t_range_count;
+ }
- t = uvchr_to_utf8(tmpbuf,nextmin);
- sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
- {
- U8 range_mark = RANGE_INDICATOR;
- sv_catpvn(transv, (char *)&range_mark, 1);
- }
- t = uvchr_to_utf8(tmpbuf, 0x7fffffff);
- sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
- t = (const U8*)SvPVX_const(transv);
- tlen = SvCUR(transv);
- tend = t + tlen;
- Safefree(cp);
- }
- else if (!rlen && !del) {
- r = t; rlen = tlen; rend = tend;
- }
+ /* This is the final character so far */
+ final_map = r_cp + r_range_count - 1;
- if (!squash) {
- if ((!rlen && !del) || t == r ||
- (tlen == rlen && memEQ((char *)t, (char *)r, tlen)))
- {
- o->op_private |= OPpTRANS_IDENTICAL;
- }
- }
+ r_count += r_range_count;
+ }
+ }
- /* extract char ranges from t and r and append them to listsv */
-
- while (t < tend || tfirst <= tlast) {
- /* see if we need more "t" chars */
- if (tfirst > tlast) {
- tfirst = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags);
- t += ulen;
- if (t < tend && *t == RANGE_INDICATOR) { /* illegal utf8 val indicates range */
- t++;
- tlast = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags);
- t += ulen;
- }
- else
- tlast = tfirst;
- }
+ /* Here, we have the next things ready in both sides. They are
+ * potentially ranges. We try to process as big a chunk as
+ * possible at once, but the lhs and rhs must be synchronized, so
+ * things like tr/A-Z/a-ij-z/ will need to be processed in 2 chunks
+ * */
+ min_range_count = MIN(t_range_count, r_range_count);
+
+ /* Search the inversion list for the entry that contains the input
+ * code point <cp>. The inversion map was initialized to cover the
+ * entire range of possible inputs, so this should not fail. So
+ * the return value is the index into the list's array of the range
+ * that contains <cp>, that is, 'i' such that array[i] <= cp <
+ * array[i+1] */
+ i = _invlist_search(t_invlist, t_cp);
+ assert(i >= 0);
+
+ /* Here, the data structure might look like:
+ *
+ * index t r Meaning
+ * [i-1] J j # J-L => j-l
+ * [i] M -1 # M => default; as do N, O, P, Q
+ * [i+1] R x # R => x, S => x+1, T => x+2
+ * [i+2] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ... infinity
+ *
+ * where 'x' and 'y' above are not to be taken literally.
+ *
+ * The maximum chunk we can handle in this loop iteration, is the
+ * smallest of the three components: the lhs 't_', the rhs 'r_',
+ * and the remainder of the range in element [i]. (In pass 1, that
+ * range will have everything in it be of the same class; we can't
+ * cross into another class.) 'min_range_count' already contains
+ * the smallest of the first two values. The final one is
+ * irrelevant if the map is to the special indicator */
+
+ invmap_range_length_remaining = ((Size_t) i + 1 < len)
+ ? t_array[i+1] - t_cp
+ : IV_MAX - t_cp;
+ span = MAX(1, MIN(min_range_count, invmap_range_length_remaining));
+
+ /* The end point of this chunk is where we are, plus the span, but
+ * never larger than the platform's infinity */
+ t_cp_end = MIN(IV_MAX, t_cp + span - 1);
+
+ if (r_cp == TR_SPECIAL_HANDLING) {
+ r_cp_end = TR_SPECIAL_HANDLING;
+ }
+ else {
+ r_cp_end = MIN(IV_MAX, r_cp + span - 1);
+
+ /* If something on the lhs is below 256, and something on the
+ * rhs is above, there is a potential mapping here across that
+ * boundary. Indeed the only way there isn't is if both sides
+ * start at the same point. That means they both cross at the
+ * same time. But otherwise one crosses before the other */
+ if (t_cp < 256 && r_cp_end > 255 && r_cp != t_cp) {
+ can_force_utf8 = TRUE;
+ }
+ }
- /* now see if we need more "r" chars */
- if (rfirst > rlast) {
- if (r < rend) {
- rfirst = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags);
- r += ulen;
- if (r < rend && *r == RANGE_INDICATOR) { /* illegal utf8 val indicates range */
- r++;
- rlast = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags);
- r += ulen;
- }
- else
- rlast = rfirst;
- }
- else {
- if (!havefinal++)
- final = rlast;
- rfirst = rlast = 0xffffffff;
- }
- }
+ /* If a character appears in the search list more than once, the
+ * 2nd and succeeding occurrences are ignored, so only do this
+ * range if haven't already processed this character. (The range
+ * has been set up so that all members in it will be of the same
+ * ilk) */
+ if (r_map[i] == TR_UNLISTED) {
+
+ /* This is the first definition for this chunk, hence is valid
+ * and needs to be processed. Here and in the comments below,
+ * we use the above sample data. The t_cp chunk must be any
+ * contiguous subset of M, N, O, P, and/or Q.
+ *
+ * In the first pass, the t_invlist has been partitioned so
+ * that all elements in any single range have the same number
+ * of bytes in their UTF-8 representations. And the r space is
+ * either a single byte, or a range of strictly monotonically
+ * increasing code points. So the final element in the range
+ * will be represented by no fewer bytes than the initial one.
+ * That means that if the final code point in the t range has
+ * at least as many bytes as the final code point in the r,
+ * then all code points in the t range have at least as many
+ * bytes as their corresponding r range element. But if that's
+ * not true, the transliteration of at least the final code
+ * point grows in length. As an example, suppose we had
+ * tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/
+ * The UTF-8 for all but 10000 occupies 3 bytes on ASCII
+ * platforms. We have deliberately set up the data structure
+ * so that any range in the lhs gets split into chunks for
+ * processing, such that every code point in a chunk has the
+ * same number of UTF-8 bytes. We only have to check the final
+ * code point in the rhs against any code point in the lhs. */
+ if ( ! pass2
+ && r_cp_end != TR_SPECIAL_HANDLING
+ && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end))
+ {
+ NV ratio = UVCHR_SKIP(r_cp_end) / UVCHR_SKIP(t_cp);
- /* now see which range will peter out first, if either. */
- tdiff = tlast - tfirst;
- rdiff = rlast - rfirst;
- tcount += tdiff + 1;
- rcount += rdiff + 1;
+ o->op_private |= OPpTRANS_GROWS;
- if (tdiff <= rdiff)
- diff = tdiff;
- else
- diff = rdiff;
+ /* Now that we know it grows, we can keep track of the
+ * largest ratio */
+ if (ratio > max_expansion) {
+ max_expansion = ratio;
+ }
+ }
- if (rfirst == 0xffffffff) {
- diff = tdiff; /* oops, pretend rdiff is infinite */
- if (diff > 0)
- Perl_sv_catpvf(aTHX_ listsv, "%04lx\t%04lx\tXXXX\n",
- (long)tfirst, (long)tlast);
- else
- Perl_sv_catpvf(aTHX_ listsv, "%04lx\t\tXXXX\n", (long)tfirst);
- }
- else {
- if (diff > 0)
- Perl_sv_catpvf(aTHX_ listsv, "%04lx\t%04lx\t%04lx\n",
- (long)tfirst, (long)(tfirst + diff),
- (long)rfirst);
- else
- Perl_sv_catpvf(aTHX_ listsv, "%04lx\t\t%04lx\n",
- (long)tfirst, (long)rfirst);
-
- if (rfirst + diff > max)
- max = rfirst + diff;
- if (!grows)
- grows = (tfirst < rfirst &&
- UVCHR_SKIP(tfirst) < UVCHR_SKIP(rfirst + diff));
- rfirst += diff + 1;
- }
- tfirst += diff + 1;
- }
+ /* The very first range is marked as adjacent to the
+ * non-existent range below it, as it causes things to "just
+ * work" (TradeMark)
+ *
+ * If the lowest code point in this chunk is M, it adjoins the
+ * J-L range */
+ if (t_cp == t_array[i]) {
+ adjacent_to_range_below = TRUE;
+
+ /* And if the map has the same offset from the beginning of
+ * the range as does this new code point (or both are for
+ * TR_SPECIAL_HANDLING), this chunk can be completely
+ * merged with the range below. EXCEPT, in the first pass,
+ * we don't merge ranges whose UTF-8 byte representations
+ * have different lengths, so that we can more easily
+ * detect if a replacement is longer than the source, that
+ * is if it 'grows'. But in the 2nd pass, there's no
+ * reason to not merge */
+ if ( (i > 0 && ( pass2
+ || UVCHR_SKIP(t_array[i-1])
+ == UVCHR_SKIP(t_cp)))
+ && ( ( r_cp == TR_SPECIAL_HANDLING
+ && r_map[i-1] == TR_SPECIAL_HANDLING)
+ || ( r_cp != TR_SPECIAL_HANDLING
+ && r_cp - r_map[i-1] == t_cp - t_array[i-1])))
+ {
+ merge_with_range_below = TRUE;
+ }
+ }
- /* compile listsv into a swash and attach to o */
+ /* Similarly, if the highest code point in this chunk is 'Q',
+ * it adjoins the range above, and if the map is suitable, can
+ * be merged with it */
+ if ( t_cp_end >= IV_MAX - 1
+ || ( (Size_t) i + 1 < len
+ && (Size_t) t_cp_end + 1 == t_array[i+1]))
+ {
+ adjacent_to_range_above = TRUE;
+ if ((Size_t) i + 1 < len)
+ if ( ( pass2
+ || UVCHR_SKIP(t_cp) == UVCHR_SKIP(t_array[i+1]))
+ && ( ( r_cp == TR_SPECIAL_HANDLING
+ && r_map[i+1] == (UV) TR_SPECIAL_HANDLING)
+ || ( r_cp != TR_SPECIAL_HANDLING
+ && r_cp_end == r_map[i+1] - 1)))
+ {
+ merge_with_range_above = TRUE;
+ }
+ }
- none = ++max;
- if (del)
- ++max;
+ if (merge_with_range_below && merge_with_range_above) {
+
+ /* Here the new chunk looks like M => m, ... Q => q; and
+ * the range above is like R => r, .... Thus, the [i-1]
+ * and [i+1] ranges should be seamlessly melded so the
+ * result looks like
+ *
+ * [i-1] J j # J-T => j-t
+ * [i] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ... infinity
+ */
+ Move(t_array + i + 2, t_array + i, len - i - 2, UV);
+ Move(r_map + i + 2, r_map + i, len - i - 2, UV);
+ len -= 2;
+ invlist_set_len(t_invlist,
+ len,
+ *(get_invlist_offset_addr(t_invlist)));
+ }
+ else if (merge_with_range_below) {
+
+ /* Here the new chunk looks like M => m, .... But either
+ * (or both) it doesn't extend all the way up through Q; or
+ * the range above doesn't start with R => r. */
+ if (! adjacent_to_range_above) {
+
+ /* In the first case, let's say the new chunk extends
+ * through O. We then want:
+ *
+ * [i-1] J j # J-O => j-o
+ * [i] P -1 # P => -1, Q => -1
+ * [i+1] R x # R => x, S => x+1, T => x+2
+ * [i+2] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ...
+ * infinity
+ */
+ t_array[i] = t_cp_end + 1;
+ r_map[i] = TR_UNLISTED;
+ }
+ else { /* Adjoins the range above, but can't merge with it
+ (because 'x' is not the next map after q) */
+ /*
+ * [i-1] J j # J-Q => j-q
+ * [i] R x # R => x, S => x+1, T => x+2
+ * [i+1] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ...
+ * infinity
+ */
- if (max > 0xffff)
- bits = 32;
- else if (max > 0xff)
- bits = 16;
- else
- bits = 8;
+ Move(t_array + i + 1, t_array + i, len - i - 1, UV);
+ Move(r_map + i + 1, r_map + i, len - i - 1, UV);
+ len--;
+ invlist_set_len(t_invlist, len,
+ *(get_invlist_offset_addr(t_invlist)));
+ }
+ }
+ else if (merge_with_range_above) {
+
+ /* Here the new chunk ends with Q => q, and the range above
+ * must start with R => r, so the two can be merged. But
+ * either (or both) the new chunk doesn't extend all the
+ * way down to M; or the mapping of the final code point
+ * range below isn't m */
+ if (! adjacent_to_range_below) {
+
+ /* In the first case, let's assume the new chunk starts
+ * with P => p. Then, because it's merge-able with the
+ * range above, that range must be R => r. We want:
+ *
+ * [i-1] J j # J-L => j-l
+ * [i] M -1 # M => -1, N => -1
+ * [i+1] P p # P-T => p-t
+ * [i+2] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ...
+ * infinity
+ */
+ t_array[i+1] = t_cp;
+ r_map[i+1] = r_cp;
+ }
+ else { /* Adjoins the range below, but can't merge with it
+ */
+ /*
+ * [i-1] J j # J-L => j-l
+ * [i] M x # M-T => x-5 .. x+2
+ * [i+1] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ...
+ * infinity
+ */
+ Move(t_array + i + 1, t_array + i, len - i - 1, UV);
+ Move(r_map + i + 1, r_map + i, len - i - 1, UV);
+ len--;
+ t_array[i] = t_cp;
+ r_map[i] = r_cp;
+ invlist_set_len(t_invlist, len,
+ *(get_invlist_offset_addr(t_invlist)));
+ }
+ }
+ else if (adjacent_to_range_below && adjacent_to_range_above) {
+ /* The new chunk completely fills the gap between the
+ * ranges on either side, but can't merge with either of
+ * them.
+ *
+ * [i-1] J j # J-L => j-l
+ * [i] M z # M => z, N => z+1 ... Q => z+4
+ * [i+1] R x # R => x, S => x+1, T => x+2
+ * [i+2] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ... infinity
+ */
+ r_map[i] = r_cp;
+ }
+ else if (adjacent_to_range_below) {
+ /* The new chunk adjoins the range below, but not the range
+ * above, and can't merge. Let's assume the chunk ends at
+ * O.
+ *
+ * [i-1] J j # J-L => j-l
+ * [i] M z # M => z, N => z+1, O => z+2
+ * [i+1] P -1 # P => -1, Q => -1
+ * [i+2] R x # R => x, S => x+1, T => x+2
+ * [i+3] U y # U => y, V => y+1, ...
+ * ...
+ * [-w] Z -1 # Z => default; as do Z+1, ... infinity
+ */
+ invlist_extend(t_invlist, len + 1);
+ t_array = invlist_array(t_invlist);
+ Renew(r_map, len + 1, UV);
+
+ Move(t_array + i + 1, t_array + i + 2, len - i - 1, UV);
+ Move(r_map + i + 1, r_map + i + 2, len - i - 1, UV);
+ r_map[i] = r_cp;
+ t_array[i+1] = t_cp_end + 1;
+ r_map[i+1] = TR_UNLISTED;
+ len++;
+ invlist_set_len(t_invlist, len,
+ *(get_invlist_offset_addr(t_invlist)));
+ }
+ else if (adjacent_to_range_above) {
+ /* The new chunk adjoins the range above, but not the range
+ * below, and can't merge. Let's assume the new chunk
+ * starts at O
+ *
+ * [i-1] J j # J-L => j-l
+ * [i] M -1 # M => default, N => default
+ * [i+1] O z # O => z, P => z+1, Q => z+2
+ * [i+2] R x # R => x, S => x+1, T => x+2
+ * [i+3] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ... infinity
+ */
+ invlist_extend(t_invlist, len + 1);
+ t_array = invlist_array(t_invlist);
+ Renew(r_map, len + 1, UV);
+
+ Move(t_array + i + 1, t_array + i + 2, len - i - 1, UV);
+ Move(r_map + i + 1, r_map + i + 2, len - i - 1, UV);
+ t_array[i+1] = t_cp;
+ r_map[i+1] = r_cp;
+ len++;
+ invlist_set_len(t_invlist, len,
+ *(get_invlist_offset_addr(t_invlist)));
+ }
+ else {
+ /* The new chunk adjoins neither the range above, nor the
+ * range below. Lets assume it is N..P => n..p
+ *
+ * [i-1] J j # J-L => j-l
+ * [i] M -1 # M => default
+ * [i+1] N n # N..P => n..p
+ * [i+2] Q -1 # Q => default
+ * [i+3] R x # R => x, S => x+1, T => x+2
+ * [i+4] U y # U => y, V => y+1, ...
+ * ...
+ * [-1] Z -1 # Z => default; as do Z+1, ... infinity
+ */
- swash = MUTABLE_SV(swash_init("utf8", "", listsv, bits, none));
-#ifdef USE_ITHREADS
- cPADOPo->op_padix = pad_alloc(OP_TRANS, SVf_READONLY);
- SvREFCNT_dec(PAD_SVl(cPADOPo->op_padix));
- PAD_SETSV(cPADOPo->op_padix, swash);
- SvPADTMP_on(swash);
- SvREADONLY_on(swash);
-#else
- cSVOPo->op_sv = swash;
-#endif
- SvREFCNT_dec(listsv);
- SvREFCNT_dec(transv);
+ invlist_extend(t_invlist, len + 2);
+ t_array = invlist_array(t_invlist);
+ Renew(r_map, len + 2, UV);
+
+ Move(t_array + i + 1,
+ t_array + i + 2 + 1, len - i - (2 - 1), UV);
+ Move(r_map + i + 1,
+ r_map + i + 2 + 1, len - i - (2 - 1), UV);
+
+ len += 2;
+ invlist_set_len(t_invlist, len,
+ *(get_invlist_offset_addr(t_invlist)));
+
+ t_array[i+1] = t_cp;
+ r_map[i+1] = r_cp;
+
+ t_array[i+2] = t_cp_end + 1;
+ r_map[i+2] = TR_UNLISTED;
+ }
+ } /* End of this chunk needs to be processed */
+
+ /* Done with this chunk. */
+ t_cp += span;
+ if (t_cp >= IV_MAX) {
+ break;
+ }
+ t_range_count -= span;
+ if (r_cp != TR_SPECIAL_HANDLING) {
+ r_cp += span;
+ r_range_count -= span;
+ }
+ else {
+ r_range_count = 0;
+ }
+
+ } /* End of loop through the search list */
+
+ /* We don't need an exact count, but we do need to know if there is
+ * anything left over in the replacement list. So, just assume it's
+ * one byte per character */
+ if (rend > r) {
+ r_count++;
+ }
+ } /* End of passes */
+
+ SvREFCNT_dec(inverted_tstr);
- if (!del && havefinal && rlen)
- (void)hv_store(MUTABLE_HV(SvRV(swash)), "FINAL", 5,
- newSVuv((UV)final), 0);
+ /* We now have normalized the input into an inversion map.
+ *
+ * See if the lhs and rhs are equivalent. If so, this tr/// is a no-op
+ * except for the count, and streamlined runtime code can be used */
+ if (!del && !squash) {
+
+ /* They are identical if they point to same address, or if everything
+ * maps to UNLISTED or to itself. This catches things that not looking
+ * at the normalized inversion map doesn't catch, like tr/aa/ab/ or
+ * tr/\x{100}-\x{104}/\x{100}-\x{102}\x{103}-\x{104} */
+ if (r0 != t0) {
+ for (i = 0; i < len; i++) {
+ if (r_map[i] != TR_UNLISTED && r_map[i] != t_array[i]) {
+ goto done_identical_check;
+ }
+ }
+ }
+
+ /* Here have gone through entire list, and didn't find any
+ * non-identical mappings */
+ o->op_private |= OPpTRANS_IDENTICAL;
+
+ done_identical_check: ;
+ }
+
+ t_array = invlist_array(t_invlist);
+
+ /* If has components above 255, we generally need to use the inversion map
+ * implementation */
+ if ( can_force_utf8
+ || ( len > 0
+ && t_array[len-1] > 255
+ /* If the final range is 0x100-INFINITY and is a special
+ * mapping, the table implementation can handle it */
+ && ! ( t_array[len-1] == 256
+ && ( r_map[len-1] == TR_UNLISTED
+ || r_map[len-1] == TR_SPECIAL_HANDLING))))
+ {
+ SV* r_map_sv;
+
+ /* A UTF-8 op is generated, indicated by this flag. This op is an
+ * sv_op */
+ o->op_private |= OPpTRANS_USE_SVOP;
+
+ if (can_force_utf8) {
+ o->op_private |= OPpTRANS_CAN_FORCE_UTF8;
+ }
+
+ /* The inversion map is pushed; first the list. */
+ invmap = MUTABLE_AV(newAV());
+ av_push(invmap, t_invlist);
- Safefree(tsave);
- Safefree(rsave);
+ /* 2nd is the mapping */
+ r_map_sv = newSVpvn((char *) r_map, len * sizeof(UV));
+ av_push(invmap, r_map_sv);
- tlen = tcount;
- rlen = rcount;
- if (r < rend)
- rlen++;
- else if (rlast == 0xffffffff)
- rlen = 0;
+ /* 3rd is the max possible expansion factor */
+ av_push(invmap, newSVnv(max_expansion));
+
+ /* Characters that are in the search list, but not in the replacement
+ * list are mapped to the final character in the replacement list */
+ if (! del && r_count < t_count) {
+ av_push(invmap, newSVuv(final_map));
+ }
+
+#ifdef USE_ITHREADS
+ cPADOPo->op_padix = pad_alloc(OP_TRANS, SVf_READONLY);
+ SvREFCNT_dec(PAD_SVl(cPADOPo->op_padix));
+ PAD_SETSV(cPADOPo->op_padix, (SV *) invmap);
+ SvPADTMP_on(invmap);
+ SvREADONLY_on(invmap);
+#else
+ cSVOPo->op_sv = (SV *) invmap;
+#endif
- goto warnins;
}
+ else {
+ OPtrans_map *tbl;
+ Size_t i;
+
+ /* The OPtrans_map struct already contains one slot; hence the -1. */
+ SSize_t struct_size = sizeof(OPtrans_map)
+ + (256 - 1 + 1)*sizeof(short);
/* Non-utf8 case: set o->op_pv to point to a simple 256+ entry lookup
* table. Entries with the value TR_UNMAPPED indicate chars not to be
* translated, while TR_DELETE indicates a search char without a
* corresponding replacement char under /d.
*
- * Normally, the table has 256 slots. However, in the presence of
- * /c, the search charlist has an implicit \x{100}-\x{7fffffff}
- * added, and if there are enough replacement chars to start pairing
- * with the \x{100},... search chars, then a larger (> 256) table
- * is allocated.
- *
- * In addition, regardless of whether under /c, an extra slot at the
- * end is used to store the final repeating char, or TR_R_EMPTY under an
- * empty replacement list, or TR_DELETE under /d; which makes the
- * runtime code easier.
- *
- * The toker will have already expanded char ranges in t and r.
+ * In addition, an extra slot at the end is used to store the final
+ * repeating char, or TR_R_EMPTY under an empty replacement list, or
+ * TR_DELETE under /d; which makes the runtime code easier.
*/
- /* Initially allocate 257-slot table: 256 for basic (non /c) usage,
- * plus final slot for repeat/TR_DELETE/TR_R_EMPTY. Later we realloc if
- * excess > * 0. The OPtrans_map struct already contains one slot;
- * hence the -1.
- */
- struct_size = sizeof(OPtrans_map) + (256 - 1 + 1)*sizeof(short);
+ /* Indicate this is an op_pv */
+ o->op_private &= ~OPpTRANS_USE_SVOP;
+
tbl = (OPtrans_map*)PerlMemShared_calloc(struct_size, 1);
tbl->size = 256;
cPVOPo->op_pv = (char*)tbl;
- if (complement) {
- Size_t excess;
+ for (i = 0; i < len; i++) {
+ STATIC_ASSERT_DECL(TR_SPECIAL_HANDLING == TR_DELETE);
+ short upper = i >= len - 1 ? 256 : t_array[i+1];
+ short to = r_map[i];
+ short j;
+ bool do_increment = TRUE;
- /* in this branch, j is a count of 'consumed' (i.e. paired off
- * with a search char) replacement chars (so j <= rlen always)
- */
- for (i = 0; i < tlen; i++)
- tbl->map[t[i]] = (short) TR_UNMAPPED;
-
- for (i = 0, j = 0; i < 256; i++) {
- if (!tbl->map[i]) {
- if (j == rlen) {
- if (del)
- tbl->map[i] = (short) TR_DELETE;
- else if (rlen)
- tbl->map[i] = r[j-1];
- else
- tbl->map[i] = (short)i;
- }
- else {
- tbl->map[i] = r[j++];
- }
- if ( tbl->map[i] >= 0
- && UVCHR_IS_INVARIANT((UV)i)
- && !UVCHR_IS_INVARIANT((UV)(tbl->map[i]))
- )
- grows = TRUE;
- }
+ /* Any code points above our limit should be irrelevant */
+ if (t_array[i] >= tbl->size) break;
+
+ /* Set up the map */
+ if (to == (short) TR_SPECIAL_HANDLING && ! del) {
+ to = final_map;
+ do_increment = FALSE;
+ }
+ else if (to < 0) {
+ do_increment = FALSE;
}
- ASSUME(j <= rlen);
- excess = rlen - j;
+ /* Create a map for everything in this range. The value increases
+ * except for the special cases */
+ for (j = t_array[i]; j < upper; j++) {
+ tbl->map[j] = to;
+ if (do_increment) to++;
+ }
+ }
- if (excess) {
- /* More replacement chars than search chars:
- * store excess replacement chars at end of main table.
- */
+ tbl->map[tbl->size] = del
+ ? (short) TR_DELETE
+ : rlen
+ ? final_map
+ : (short) TR_R_EMPTY;
- struct_size += excess * sizeof(short);
- tbl = (OPtrans_map*)PerlMemShared_realloc(tbl, struct_size);
- tbl->size += excess;
- cPVOPo->op_pv = (char*)tbl;
+ SvREFCNT_dec(t_invlist);
- for (i = 0; i < excess; i++)
- tbl->map[i + 256] = r[j+i];
- }
- else {
- /* no more replacement chars than search chars */
- if (!rlen && !del && !squash)
- o->op_private |= OPpTRANS_IDENTICAL;
- }
+#if 0 /* code that added excess above-255 chars at the end of the table, in
+ case we ever want to not use the inversion map implementation for
+ this */
- tbl->map[tbl->size] = del
- ? (short) TR_DELETE
- : rlen
- ? r[rlen - 1]
- : (short) TR_R_EMPTY;
+ ASSUME(j <= rlen);
+ excess = rlen - j;
+
+ if (excess) {
+ /* More replacement chars than search chars:
+ * store excess replacement chars at end of main table.
+ */
+
+ struct_size += excess;
+ tbl = (OPtrans_map*)PerlMemShared_realloc(tbl,
+ struct_size + excess * sizeof(short));
+ tbl->size += excess;
+ cPVOPo->op_pv = (char*)tbl;
+
+ for (i = 0; i < excess; i++)
+ tbl->map[i + 256] = r[j+i];
}
else {
- if (!rlen && !del) {
- r = t; rlen = tlen;
- if (!squash)
- o->op_private |= OPpTRANS_IDENTICAL;
- }
- else if (!squash && rlen == tlen && memEQ((char*)t, (char*)r, tlen)) {
- o->op_private |= OPpTRANS_IDENTICAL;
- }
+ /* no more replacement chars than search chars */
+#endif
- for (i = 0; i < 256; i++)
- tbl->map[i] = (short) TR_UNMAPPED;
- for (i = 0, j = 0; i < tlen; i++,j++) {
- if (j >= rlen) {
- if (del) {
- if (tbl->map[t[i]] == (short) TR_UNMAPPED)
- tbl->map[t[i]] = (short) TR_DELETE;
- continue;
- }
- --j;
- }
- if (tbl->map[t[i]] == (short) TR_UNMAPPED) {
- if ( UVCHR_IS_INVARIANT(t[i])
- && ! UVCHR_IS_INVARIANT(r[j]))
- grows = TRUE;
- tbl->map[t[i]] = r[j];
- }
- }
- tbl->map[tbl->size] = del
- ? (short) TR_UNMAPPED
- : rlen
- ? (short) TR_UNMAPPED
- : (short) TR_R_EMPTY;
- }
+ }
- /* both non-utf8 and utf8 code paths end up here */
+ Safefree(r_map);
- warnins:
- if(del && rlen == tlen) {
+ if(del && rlen != 0 && r_count == t_count) {
Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Useless use of /d modifier in transliteration operator");
- } else if(rlen > tlen && !complement) {
+ } else if(r_count > t_count) {
Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Replacement list is longer than search list");
}
- if (grows)
- o->op_private |= OPpTRANS_GROWS;
op_free(expr);
op_free(repl);
diff --git a/op.h b/op.h
index 3781960e19..537b2ef0d3 100644
--- a/op.h
+++ b/op.h
@@ -1110,7 +1110,7 @@ C<sib> is non-null. For a higher-level interface, see C<L</op_sibling_splice>>.
"Use of strings with code points over 0xFF as arguments to " \
"%s operator is not allowed"
#endif
-#if defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
+#if defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_PERL_C)
# define TR_UNMAPPED (UV)-1
# define TR_DELETE (UV)-2
# define TR_R_EMPTY (UV)-3 /* rhs (replacement) is empty */
diff --git a/proto.h b/proto.h
index 20e4b0e511..51c316e070 100644
--- a/proto.h
+++ b/proto.h
@@ -4859,31 +4859,26 @@ STATIC Size_t S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const
#define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX \
assert(sv); assert(tbl)
-STATIC Size_t S_do_trans_complex_utf8(pTHX_ SV * const sv)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8 \
- assert(sv)
-
STATIC Size_t S_do_trans_count(pTHX_ SV * const sv, const OPtrans_map * const tbl)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_DO_TRANS_COUNT \
assert(sv); assert(tbl)
-STATIC Size_t S_do_trans_count_utf8(pTHX_ SV * const sv)
+STATIC Size_t S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const map)
__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8 \
- assert(sv)
+#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP \
+ assert(sv); assert(map)
+
+STATIC Size_t S_do_trans_invmap(pTHX_ SV * const sv, AV * const map)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_DO_TRANS_INVMAP \
+ assert(sv); assert(map)
STATIC Size_t S_do_trans_simple(pTHX_ SV * const sv, const OPtrans_map * const tbl)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE \
assert(sv); assert(tbl)
-STATIC Size_t S_do_trans_simple_utf8(pTHX_ SV * const sv)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8 \
- assert(sv)
-
#endif
#if defined(PERL_IN_DUMP_C)
STATIC CV* S_deb_curcv(pTHX_ I32 ix);
@@ -5540,12 +5535,6 @@ STATIC SV * S_space_join_names_mortal(pTHX_ char *const *array);
STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist);
#define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS \
assert(pRExC_state); assert(invlist)
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST
-#endif
-
STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_ADD_DATA \
@@ -5582,13 +5571,6 @@ STATIC SV * S_get_ANYOFM_contents(pTHX_ const regnode * n)
STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node);
#define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \
assert(pRExC_state); assert(node)
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \
- assert(invlist)
-#endif
-
STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth);
#define PERL_ARGS_ASSERT_GROK_BSLASH_N \
assert(pRExC_state); assert(flagp)
@@ -5614,46 +5596,12 @@ PERL_STATIC_INLINE SV* S_invlist_contents(pTHX_ SV* const invlist, const bool tr
#endif
#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len);
-#define PERL_ARGS_ASSERT_INVLIST_EXTEND \
- assert(invlist)
-#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \
- assert(invlist)
-#endif
-
-#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S_invlist_is_iterating(SV* const invlist)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_INVLIST_IS_ITERATING \
assert(invlist)
#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist);
-#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \
- assert(invlist)
-#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist);
-#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \
- assert(invlist)
-#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \
- assert(invlist); assert(start); assert(end)
-#endif
-
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset);
-#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \
- assert(invlist)
-#endif
STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc);
#define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT \
assert(pRExC_state); assert(ssc)
@@ -5811,6 +5759,55 @@ PERL_CALLCONV void Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char*
#define PERL_ARGS_ASSERT__INVLIST_DUMP \
assert(file); assert(indent); assert(invlist)
#endif
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \
+ assert(invlist)
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len);
+#define PERL_ARGS_ASSERT_INVLIST_EXTEND \
+ assert(invlist)
+#endif
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \
+ assert(invlist)
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist);
+#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \
+ assert(invlist)
+#endif
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist);
+#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \
+ assert(invlist)
+#endif
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \
+ assert(invlist); assert(start); assert(end)
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset);
+#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \
+ assert(invlist)
+#endif
+#endif
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C)
PERL_CALLCONV bool Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b);
#define PERL_ARGS_ASSERT__INVLISTEQ \
@@ -5832,7 +5829,7 @@ PERL_CALLCONV void Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode*
#define PERL_ARGS_ASSERT_REGPROP \
assert(sv); assert(o)
#endif
-#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C)
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S__invlist_contains_cp(SV* const invlist, const UV cp)
__attribute__warn_unused_result__;
diff --git a/toke.c b/toke.c
index 0fdcadc78c..2c448eb6de 100644
--- a/toke.c
+++ b/toke.c
@@ -2905,12 +2905,12 @@ S_scan_const(pTHX_ char *start)
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
bool in_charclass = FALSE; /* within /[...]/ */
- bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
UTF8? But, this can show as true
when the source isn't utf8, as for
example when it is entirely composed
of hex constants */
+ bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
number of characters found so far
that will expand (into 2 bytes)
@@ -2951,11 +2951,6 @@ S_scan_const(pTHX_ char *start)
PERL_ARGS_ASSERT_SCAN_CONST;
assert(PL_lex_inwhat != OP_TRANSR);
- if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
- /* If we are doing a trans and we know we want UTF8, set expectation */
- d_is_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
- s_is_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
- }
/* Protect sv from errors and fatal warnings. */
ENTER_with_name("scan_const");
@@ -3646,13 +3641,6 @@ S_scan_const(pTHX_ char *start)
}
d = (char*)uvchr_to_utf8((U8*)d, uv);
- if (PL_lex_inwhat == OP_TRANS
- && PL_parser->lex_sub_op)
- {
- PL_parser->lex_sub_op->op_private |=
- (PL_lex_repl ? OPpTRANS_FROM_UTF
- : OPpTRANS_TO_UTF);
- }
}
}
#ifdef EBCDIC
@@ -4133,10 +4121,6 @@ S_scan_const(pTHX_ char *start)
SvPOK_on(sv);
if (d_is_utf8) {
SvUTF8_on(sv);
- if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
- PL_parser->lex_sub_op->op_private |=
- (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
- }
}
/* shrink the sv if we allocated more than we used */
@@ -10297,9 +10281,7 @@ S_scan_trans(pTHX_ char *start)
o = newPVOP(nondestruct ? OP_TRANSR : OP_TRANS, 0, (char*)NULL);
o->op_private &= ~OPpTRANS_ALL;
- o->op_private |= del|squash|complement|
- (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
- (DO_UTF8(PL_parser->lex_sub_repl) ? OPpTRANS_TO_UTF : 0);
+ o->op_private |= del|squash|complement;
PL_lex_op = o;
pl_yylval.ival = nondestruct ? OP_TRANSR : OP_TRANS;