diff options
-rw-r--r-- | doop.c | 506 | ||||
-rw-r--r-- | dump.c | 13 | ||||
-rw-r--r-- | embed.fnc | 10 | ||||
-rw-r--r-- | embed.h | 25 | ||||
-rw-r--r-- | invlist_inline.h | 5 | ||||
-rw-r--r-- | lib/B/Deparse.pm | 163 | ||||
-rw-r--r-- | op.c | 1310 | ||||
-rw-r--r-- | op.h | 2 | ||||
-rw-r--r-- | proto.h | 119 | ||||
-rw-r--r-- | toke.c | 22 |
10 files changed, 1341 insertions, 834 deletions
@@ -22,6 +22,7 @@ #include "EXTERN.h" #define PERL_IN_DOOP_C #include "perl.h" +#include "invlist_inline.h" #ifndef PERL_MICRO #include <signal.h> @@ -297,328 +298,240 @@ S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const tbl) /* Helper function for do_trans(). - * Handles utf8 cases(*) not involving the /c, /d, /s flags, - * and where search and replacement charlists aren't identical. - * (*) i.e. where the search or replacement charlists are utf8. sv may - * or may not be utf8. + * Handles cases where an inversion map implementation is to be used and the + * search and replacement charlists are identical: so the string isn't + * modified, and only a count of modifiable chars is needed. + * + * Note that it doesn't handle /d nor /s, since these modify the string + * even if the replacement charlist is empty. + * + * sv may or may not be utf8. */ STATIC Size_t -S_do_trans_simple_utf8(pTHX_ SV * const sv) +S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const invmap) { U8 *s; U8 *send; - U8 *d; - U8 *start; - U8 *dstart, *dend; Size_t matches = 0; - const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS); STRLEN len; - SV* const rv = -#ifdef USE_ITHREADS - PAD_SVl(cPADOP->op_padix); -#else - MUTABLE_SV(cSVOP->op_sv); -#endif - HV* const hv = MUTABLE_HV(SvRV(rv)); - SV* const * svp = hv_fetchs(hv, "NONE", FALSE); - const UV none = svp ? SvUV(*svp) : 0x7fffffff; - const UV extra = none + 1; - UV final = 0; - U8 hibit = 0; + SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE); + SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE); + SV* from_invlist = *from_invlist_ptr; + SV* to_invmap_sv = *to_invmap_ptr; + UV* map = (UV *) SvPVX(to_invmap_sv); - PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8; + PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP; s = (U8*)SvPV_nomg(sv, len); - if (!SvUTF8(sv)) { - hibit = ! is_utf8_invariant_string(s, len); - if (hibit) { - s = bytes_to_utf8(s, &len); - } - } + send = s + len; - start = s; - svp = hv_fetchs(hv, "FINAL", FALSE); - if (svp) - final = SvUV(*svp); + while (s < send) { + UV from; + SSize_t i; + STRLEN s_len; + + /* Get the code point of the next character in the string */ + if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) { + from = *s; + s_len = 1; + } + else { + from = utf8_to_uvchr_buf(s, send, &s_len); + if (from == 0 && *s != '\0') { + _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE); + } + } - if (grows) { - /* d needs to be bigger than s, in case e.g. upgrading is required */ - Newx(d, len * 3 + UTF8_MAXBYTES, U8); - dend = d + len * 3; - dstart = d; - } - else { - dstart = d = s; - dend = d + len; - } + /* Look the code point up in the data structure for this tr/// to get + * what it maps to */ + i = _invlist_search(from_invlist, from); + assert(i >= 0); - while (s < send) { - const UV uv = swash_fetch(rv, s, TRUE); - if (uv < none) { - s += UTF8SKIP(s); - matches++; - d = uvchr_to_utf8(d, uv); - } - else if (uv == none) { - const int i = UTF8SKIP(s); - Move(s, d, i, U8); - d += i; - s += i; - } - else if (uv == extra) { - s += UTF8SKIP(s); - matches++; - d = uvchr_to_utf8(d, final); - } - else - s += UTF8SKIP(s); - - if (d > dend) { - const STRLEN clen = d - dstart; - const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; - if (!grows) - Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__); - Renew(dstart, nlen + UTF8_MAXBYTES, U8); - d = dstart + clen; - dend = dstart + nlen; - } - } - if (grows || hibit) { - sv_setpvn(sv, (char*)dstart, d - dstart); - Safefree(dstart); - if (grows && hibit) - Safefree(start); - } - else { - *d = '\0'; - SvCUR_set(sv, d - dstart); + if (map[i] != (UV) TR_UNLISTED) { + matches++; + } + + s += s_len; } - SvSETMAGIC(sv); - SvUTF8_on(sv); return matches; } /* Helper function for do_trans(). - * Handles utf8 cases(*) where search and replacement charlists are - * identical: so the string isn't modified, and only a count of modifiable - * chars is needed. - * Note that it doesn't handle /d or /s, since these modify the string - * even if the replacement charlist is empty. - * (*) i.e. where the search or replacement charlists are utf8. sv may - * or may not be utf8. + * Handles cases where an inversion map implementation is to be used and the + * search and replacement charlists are either not identical or flags are + * present. + * + * sv may or may not be utf8. */ STATIC Size_t -S_do_trans_count_utf8(pTHX_ SV * const sv) +S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap) { - const U8 *s; - const U8 *start = NULL; - const U8 *send; + U8 *s; + U8 *send; + U8 *d; + U8 *s0; + U8 *d0; Size_t matches = 0; STRLEN len; - SV* const rv = -#ifdef USE_ITHREADS - PAD_SVl(cPADOP->op_padix); -#else - MUTABLE_SV(cSVOP->op_sv); -#endif - HV* const hv = MUTABLE_HV(SvRV(rv)); - SV* const * const svp = hv_fetchs(hv, "NONE", FALSE); - const UV none = svp ? SvUV(*svp) : 0x7fffffff; - const UV extra = none + 1; - U8 hibit = 0; - - PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8; + SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE); + SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE); + SV** const to_expansion_ptr = av_fetch(invmap, 2, TRUE); + NV max_expansion = SvNV(*to_expansion_ptr); + SV* from_invlist = *from_invlist_ptr; + SV* to_invmap_sv = *to_invmap_ptr; + UV* map = (UV *) SvPVX(to_invmap_sv); + UV previous_map = TR_OOB; + const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH); + const bool delete_unfound = cBOOL(PL_op->op_private & OPpTRANS_DELETE); + bool inplace = ! cBOOL(PL_op->op_private & OPpTRANS_GROWS); + const UV* from_array = invlist_array(from_invlist); + UV final_map; + bool out_is_utf8 = SvUTF8(sv); + STRLEN s_len; + + PERL_ARGS_ASSERT_DO_TRANS_INVMAP; + + /* A third element in the array indicates that the replacement list was + * shorter than the search list, and this element contains the value to use + * for the items that don't correspond */ + if (av_top_index(invmap) >= 3) { + SV** const final_map_ptr = av_fetch(invmap, 3, TRUE); + SV* const final_map_sv = *final_map_ptr; + final_map = SvUV(final_map_sv); + } - s = (const U8*)SvPV_nomg_const(sv, len); - if (!SvUTF8(sv)) { - hibit = ! is_utf8_invariant_string(s, len); - if (hibit) { - start = s = bytes_to_utf8(s, &len); - } + /* If there is something in the transliteration that could force the input + * to be changed to UTF-8, we don't know if we can do it in place, so + * assume cannot */ + if (! out_is_utf8 && (PL_op->op_private & OPpTRANS_CAN_FORCE_UTF8)) { + inplace = FALSE; + if (max_expansion < 2) { + max_expansion = 2; + } } + + s = (U8*)SvPV_nomg(sv, len); send = s + len; + s0 = s; - while (s < send) { - const UV uv = swash_fetch(rv, s, TRUE); - if (uv < none || uv == extra) - matches++; - s += UTF8SKIP(s); + /* We know by now if there are some possible input strings whose + * transliterations are longer than the input. If none can, we just edit + * in place. */ + if (inplace) { + d0 = d = s; + } + else { + /* Here, we can't edit in place. We have no idea how much, if any, + * this particular input string will grow. However, the compilation + * calculated the maximum expansion possible. Use that to allocale + * based on the worst case scenario. */ + Newx(d, len * max_expansion + 1, U8); + d0 = d; } - if (hibit) - Safefree(start); - return matches; -} + restart: + /* Do the actual transliteration */ + while (s < send) { + UV from; + UV to; + SSize_t i; + STRLEN s_len; + + /* Get the code point of the next character in the string */ + if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) { + from = *s; + s_len = 1; + } + else { + from = utf8_to_uvchr_buf(s, send, &s_len); + if (from == 0 && *s != '\0') { + _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE); + } + } -/* Helper function for do_trans(). - * Handles utf8 cases(*) involving the /c, /d, /s flags, - * and where search and replacement charlists aren't identical. - * (*) i.e. where the search or replacement charlists are utf8. sv may - * or may not be utf8. - */ + /* Look the code point up in the data structure for this tr/// to get + * what it maps to */ + i = _invlist_search(from_invlist, from); + assert(i >= 0); -STATIC Size_t -S_do_trans_complex_utf8(pTHX_ SV * const sv) -{ - U8 *start, *send; - U8 *d; - Size_t matches = 0; - const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH); - const bool del = cBOOL(PL_op->op_private & OPpTRANS_DELETE); - const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS); - SV* const rv = -#ifdef USE_ITHREADS - PAD_SVl(cPADOP->op_padix); -#else - MUTABLE_SV(cSVOP->op_sv); -#endif - HV * const hv = MUTABLE_HV(SvRV(rv)); - SV * const *svp = hv_fetchs(hv, "NONE", FALSE); - const UV none = svp ? SvUV(*svp) : 0x7fffffff; - const UV extra = none + 1; - UV final = 0; - bool havefinal = FALSE; - STRLEN len; - U8 *dstart, *dend; - U8 hibit = 0; - U8 *s = (U8*)SvPV_nomg(sv, len); + to = map[i]; - PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8; + if (to == (UV) TR_UNLISTED) { /* Just copy the unreplaced character */ + if (UVCHR_IS_INVARIANT(from) || ! out_is_utf8) { + *d++ = from; + } + else if (SvUTF8(sv)) { + Move(s, d, s_len, U8); + d += s_len; + } + else { /* Convert to UTF-8 */ + append_utf8_from_native_byte(*s, &d); + } - if (!SvUTF8(sv)) { - hibit = ! is_utf8_invariant_string(s, len); - if (hibit) { - s = bytes_to_utf8(s, &len); + previous_map = to; + s += s_len; + continue; } - } - send = s + len; - start = s; - svp = hv_fetchs(hv, "FINAL", FALSE); - if (svp) { - final = SvUV(*svp); - havefinal = TRUE; - } + /* Everything else is counted as a match */ + matches++; - if (grows) { - /* d needs to be bigger than s, in case e.g. upgrading is required */ - Newx(d, len * 3 + UTF8_MAXBYTES, U8); - dend = d + len * 3; - dstart = d; - } - else { - dstart = d = s; - dend = d + len; - } + if (to == (UV) TR_SPECIAL_HANDLING) { + if (delete_unfound) { + previous_map = to; + s += s_len; + continue; + } - if (squash) { - UV puv = 0xfeedface; - while (s < send) { - UV uv = swash_fetch(rv, s, TRUE); - - if (d > dend) { - const STRLEN clen = d - dstart; - const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; - if (!grows) - Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); - Renew(dstart, nlen + UTF8_MAXBYTES, U8); - d = dstart + clen; - dend = dstart + nlen; - } - if (uv < none) { - matches++; - s += UTF8SKIP(s); - if (uv != puv) { - d = uvchr_to_utf8(d, uv); - puv = uv; - } - continue; - } - else if (uv == none) { /* "none" is unmapped character */ - const int i = UTF8SKIP(s); - Move(s, d, i, U8); - d += i; - s += i; - puv = 0xfeedface; - continue; - } - else if (uv == extra && !del) { - matches++; - if (havefinal) { - s += UTF8SKIP(s); - if (puv != final) { - d = uvchr_to_utf8(d, final); - puv = final; - } - } - else { - STRLEN len; - uv = utf8n_to_uvchr(s, send - s, &len, UTF8_ALLOW_DEFAULT); - if (uv != puv) { - Move(s, d, len, U8); - d += len; - puv = uv; - } - s += len; - } - continue; - } - matches++; /* "none+1" is delete character */ - s += UTF8SKIP(s); - } - } - else { - while (s < send) { - const UV uv = swash_fetch(rv, s, TRUE); - if (d > dend) { - const STRLEN clen = d - dstart; - const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; - if (!grows) - Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); - Renew(dstart, nlen + UTF8_MAXBYTES, U8); - d = dstart + clen; - dend = dstart + nlen; - } - if (uv < none) { - matches++; - s += UTF8SKIP(s); - d = uvchr_to_utf8(d, uv); - continue; - } - else if (uv == none) { /* "none" is unmapped character */ - const int i = UTF8SKIP(s); - Move(s, d, i, U8); - d += i; - s += i; - continue; - } - else if (uv == extra && !del) { - matches++; - s += UTF8SKIP(s); - d = uvchr_to_utf8(d, final); - continue; - } - matches++; /* "none+1" is delete character */ - s += UTF8SKIP(s); - } + /* Use the final character in the replacement list */ + to = final_map; + } + else { /* Here the input code point is to be remapped. The actual + value is offset from the base of this entry */ + to += from - from_array[i]; + } + + /* If copying all occurrences, or this is the first occurrence, copy it + * to the output */ + if (! squash || to != previous_map) { + if (out_is_utf8) { + d = uvchr_to_utf8(d, to); + } + else { + if (to >= 256) { /* If need to convert to UTF-8, restart */ + out_is_utf8 = TRUE; + s = s0; + d = d0; + matches = 0; + goto restart; + } + *d++ = to; + } + } + + previous_map = to; + s += s_len; } - if (grows || hibit) { - sv_setpvn(sv, (char*)dstart, d - dstart); - Safefree(dstart); - if (grows && hibit) - Safefree(start); + + s_len = 0; + s += s_len; + if (! inplace) { + sv_setpvn(sv, (char*)d0, d - d0); } else { *d = '\0'; - SvCUR_set(sv, d - dstart); + SvCUR_set(sv, d - d0); + } + + if (! SvUTF8(sv) && out_is_utf8) { + SvUTF8_on(sv); } - SvUTF8_on(sv); SvSETMAGIC(sv); return matches; @@ -627,7 +540,8 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv) /* Execute a tr//. sv is the value to be translated, while PL_op * should be an OP_TRANS or OP_TRANSR op, whose op_pv field contains a - * translation table or whose op_sv field contains a swash. + * translation table or whose op_sv field contains an inversion map. + * * Returns a count of number of characters translated */ @@ -636,31 +550,49 @@ Perl_do_trans(pTHX_ SV *sv) { STRLEN len; const U8 flags = PL_op->op_private; - const U8 hasutf = flags & (OPpTRANS_FROM_UTF | OPpTRANS_TO_UTF); + bool use_utf8_fcns = cBOOL(flags & OPpTRANS_USE_SVOP); + bool identical = cBOOL(flags & OPpTRANS_IDENTICAL); PERL_ARGS_ASSERT_DO_TRANS; - if (SvREADONLY(sv) && !(flags & OPpTRANS_IDENTICAL)) { + if (SvREADONLY(sv) && ! identical) { Perl_croak_no_modify(); } (void)SvPV_const(sv, len); if (!len) return 0; - if (!(flags & OPpTRANS_IDENTICAL)) { + if (! identical) { if (!SvPOKp(sv) || SvTHINKFIRST(sv)) (void)SvPV_force_nomg(sv, len); (void)SvPOK_only_UTF8(sv); } - /* If we use only OPpTRANS_IDENTICAL to bypass the READONLY check, - * we must also rely on it to choose the readonly strategy. - */ - if (flags & OPpTRANS_IDENTICAL) { - return hasutf ? do_trans_count_utf8(sv) : do_trans_count(sv, (OPtrans_map*)cPVOP->op_pv); - } else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) { - return hasutf ? do_trans_complex_utf8(sv) : do_trans_complex(sv, (OPtrans_map*)cPVOP->op_pv); - } else { - return hasutf ? do_trans_simple_utf8(sv) : do_trans_simple(sv, (OPtrans_map*)cPVOP->op_pv); + if (use_utf8_fcns) { + SV* const map = +#ifdef USE_ITHREADS + PAD_SVl(cPADOP->op_padix); +#else + MUTABLE_SV(cSVOP->op_sv); +#endif + + if (identical) { + return do_trans_count_invmap(sv, (AV *) map); + } + else { + return do_trans_invmap(sv, (AV *) map); + } + } + else { + const OPtrans_map * const map = (OPtrans_map*)cPVOP->op_pv; + + if (identical) { + return do_trans_count(sv, map); + } + else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) { + return do_trans_complex(sv, map); + } + else + return do_trans_simple(sv, map); } } @@ -1305,13 +1305,13 @@ S_do_op_dump_bar(pTHX_ I32 level, UV bar, PerlIO *file, const OP *o) case OP_TRANS: case OP_TRANSR: - if (o->op_private & (OPpTRANS_FROM_UTF | OPpTRANS_TO_UTF)) { - /* utf8: table stored as a swash */ + if (o->op_private & OPpTRANS_USE_SVOP) { + /* utf8: table stored as an inversion map */ #ifndef USE_ITHREADS - /* with ITHREADS, swash is stored in the pad, and the right pad + /* with ITHREADS, it is stored in the pad, and the right pad * may not be active here, so skip */ S_opdump_indent(aTHX_ o, level, bar, file, - "SWASH = 0x%" UVxf "\n", + "INVMAP = 0x%" UVxf "\n", PTR2UV(MUTABLE_SV(cSVOPo->op_sv))); #endif } @@ -2986,11 +2986,10 @@ Perl_op_class(pTHX_ const OP *o) * pointer to a table of shorts used to look up translations. * Under utf8, however, a simple table isn't practical; instead, * the OP is an SVOP (or, under threads, a PADOP), - * and the SV is a reference to a swash - * (i.e., an RV pointing to an HV). + * and the SV is an AV. */ return (!custom && - (o->op_private & (OPpTRANS_TO_UTF|OPpTRANS_FROM_UTF)) + (o->op_private & OPpTRANS_USE_SVOP) ) #if defined(USE_ITHREADS) ? OPclass_PADOP : OPclass_PVOP; @@ -1870,7 +1870,7 @@ Apd |void |sv_vsetpvfn |NN SV *const sv|NN const char *const pat|const STRLEN pa ApR |NV |str_to_version |NN SV *sv EXpR |SV* |swash_init |NN const char* pkg|NN const char* name|NN SV* listsv|I32 minbits|I32 none EXp |UV |swash_fetch |NN SV *swash|NN const U8 *ptr|bool do_utf8 -#if defined(PERL_IN_REGCOMP_C) +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) EiR |SV* |add_cp_to_invlist |NULLOK SV* invlist|const UV cp Ei |void |invlist_extend |NN SV* const invlist|const UV len Ei |void |invlist_set_len|NN SV* const invlist|const UV len|const bool offset @@ -1922,7 +1922,8 @@ EpX |SV* |invlist_clone |NN SV* const invlist|NULLOK SV* newlist #endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) \ || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) \ - || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) + || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) \ + || defined(PERL_IN_DOOP_C) EiRT |UV* |invlist_array |NN SV* const invlist EiRT |bool |is_invlist |NULLOK SV* const invlist EiRT |bool* |get_invlist_offset_addr|NN SV* invlist @@ -2308,9 +2309,8 @@ p |void |init_constants SR |Size_t |do_trans_simple |NN SV * const sv|NN const OPtrans_map * const tbl SR |Size_t |do_trans_count |NN SV * const sv|NN const OPtrans_map * const tbl SR |Size_t |do_trans_complex |NN SV * const sv|NN const OPtrans_map * const tbl -SR |Size_t |do_trans_simple_utf8 |NN SV * const sv -SR |Size_t |do_trans_count_utf8 |NN SV * const sv -SR |Size_t |do_trans_complex_utf8 |NN SV * const sv +SR |Size_t |do_trans_invmap |NN SV * const sv|NN AV * const map +SR |Size_t |do_trans_count_invmap |NN SV * const sv|NN AV * const map #endif #if defined(PERL_IN_GV_C) @@ -1014,7 +1014,6 @@ # endif # if defined(PERL_IN_REGCOMP_C) #define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c) -#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) #define add_data S_add_data #define add_multi_match(a,b,c) S_add_multi_match(aTHX_ a,b,c) #define change_engine_size(a,b) S_change_engine_size(aTHX_ a,b) @@ -1024,20 +1023,13 @@ #define edit_distance S_edit_distance #define get_ANYOFM_contents(a) S_get_ANYOFM_contents(aTHX_ a) #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) -#define get_invlist_iter_addr S_get_invlist_iter_addr #define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g) #define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d) #define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e) #define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e) #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) Perl_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j) #define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b) -#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b) -#define invlist_highest S_invlist_highest #define invlist_is_iterating S_invlist_is_iterating -#define invlist_iterfinish S_invlist_iterfinish -#define invlist_iterinit S_invlist_iterinit -#define invlist_iternext S_invlist_iternext -#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c) #define is_ssc_worth_it S_is_ssc_worth_it #define join_exact(a,b,c,d,e,f,g) S_join_exact(aTHX_ a,b,c,d,e,f,g) #define make_exactf_invlist(a,b) S_make_exactf_invlist(aTHX_ a,b) @@ -1083,6 +1075,16 @@ # if defined(PERL_IN_REGCOMP_C) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) #define _invlist_dump(a,b,c,d) Perl__invlist_dump(aTHX_ a,b,c,d) # endif +# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) +#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) +#define get_invlist_iter_addr S_get_invlist_iter_addr +#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b) +#define invlist_highest S_invlist_highest +#define invlist_iterfinish S_invlist_iterfinish +#define invlist_iterinit S_invlist_iterinit +#define invlist_iternext S_invlist_iternext +#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c) +# endif # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) #define _invlistEQ(a,b,c) Perl__invlistEQ(aTHX_ a,b,c) #define _new_invlist_C_array(a) Perl__new_invlist_C_array(aTHX_ a) @@ -1094,7 +1096,7 @@ #endif #define regprop(a,b,c,d,e) Perl_regprop(aTHX_ a,b,c,d,e) # endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) +# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) #define _invlist_contains_cp S__invlist_contains_cp #define _invlist_len S__invlist_len #define _invlist_search Perl__invlist_search @@ -1603,11 +1605,10 @@ # endif # if defined(PERL_IN_DOOP_C) #define do_trans_complex(a,b) S_do_trans_complex(aTHX_ a,b) -#define do_trans_complex_utf8(a) S_do_trans_complex_utf8(aTHX_ a) #define do_trans_count(a,b) S_do_trans_count(aTHX_ a,b) -#define do_trans_count_utf8(a) S_do_trans_count_utf8(aTHX_ a) +#define do_trans_count_invmap(a,b) S_do_trans_count_invmap(aTHX_ a,b) +#define do_trans_invmap(a,b) S_do_trans_invmap(aTHX_ a,b) #define do_trans_simple(a,b) S_do_trans_simple(aTHX_ a,b) -#define do_trans_simple_utf8(a) S_do_trans_simple_utf8(aTHX_ a) # endif # if defined(PERL_IN_DUMP_C) #define deb_curcv(a) S_deb_curcv(aTHX_ a) diff --git a/invlist_inline.h b/invlist_inline.h index 76d6dda998..33f8aee385 100644 --- a/invlist_inline.h +++ b/invlist_inline.h @@ -14,7 +14,8 @@ || defined(PERL_IN_REGEXEC_C) \ || defined(PERL_IN_TOKE_C) \ || defined(PERL_IN_PP_C) \ - || defined(PERL_IN_OP_C) + || defined(PERL_IN_OP_C) \ + || defined(PERL_IN_DOOP_C) /* An element is in an inversion list iff its index is even numbered: 0, 2, 4, * etc */ @@ -92,7 +93,7 @@ S_invlist_array(SV* const invlist) } #endif -#if defined(PERL_IN_REGCOMP_C) +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) diff --git a/lib/B/Deparse.pm b/lib/B/Deparse.pm index 88555ffbe3..1ae4619d5d 100644 --- a/lib/B/Deparse.pm +++ b/lib/B/Deparse.pm @@ -279,6 +279,8 @@ BEGIN { for (qw[ const stringify rv2sv list glob pushmark null aelem # _pessimise_walk(): recursively walk the optree of a sub, # possibly undoing optimisations along the way. +sub DEBUG { 0 } + sub _pessimise_walk { my ($self, $startop) = @_; @@ -5714,100 +5716,81 @@ sub tr_chr { } } -# XXX This doesn't yet handle all cases correctly either +sub tr_invmap { + my ($invlist_ref, $map_ref) = @_; -sub tr_decode_utf8 { - my($swash_hv, $flags) = @_; - my %swash = $swash_hv->ARRAY; - my $final = undef; - $final = $swash{'FINAL'}->IV if exists $swash{'FINAL'}; - my $none = $swash{"NONE"}->IV; - my $extra = $none + 1; - my(@from, @delfrom, @to); - my $line; - foreach $line (split /\n/, $swash{'LIST'}->PV) { - my($min, $max, $result) = split(/\t/, $line); - $min = hex $min; - if (length $max) { - $max = hex $max; - } else { - $max = $min; - } - $result = hex $result; - if ($result == $extra) { - push @delfrom, [$min, $max]; - } else { - push @from, [$min, $max]; - push @to, [$result, $result + $max - $min]; - } - } - for my $i (0 .. $#from) { - if ($from[$i][0] == ord '-') { - unshift @from, splice(@from, $i, 1); - unshift @to, splice(@to, $i, 1); - last; - } elsif ($from[$i][1] == ord '-') { - $from[$i][1]--; - $to[$i][1]--; - unshift @from, ord '-'; - unshift @to, ord '-'; - last; - } - } - for my $i (0 .. $#delfrom) { - if ($delfrom[$i][0] == ord '-') { - push @delfrom, splice(@delfrom, $i, 1); - last; - } elsif ($delfrom[$i][1] == ord '-') { - $delfrom[$i][1]--; - push @delfrom, ord '-'; - last; - } + my $infinity = ~0 >> 1; # IV_MAX + my $from = ""; + my $to = ""; + + for my $i (0.. @$invlist_ref - 1) { + my $this_from = $invlist_ref->[$i]; + my $map = $map_ref->[$i]; + my $upper = ($i < @$invlist_ref - 1) + ? $invlist_ref->[$i+1] + : $infinity; + my $range = $upper - $this_from - 1; + if (DEBUG) { + print STDERR "i=$i, from=$this_from, upper=$upper, range=$range\n"; + } + next if $map == ~0; + next if $map == ~0 - 1; + $from .= tr_chr($this_from); + $to .= tr_chr($map); + next if $range == 0; # Single code point + if ($range == 1) { # Adjacent code points + $from .= tr_chr($this_from + 1); + $to .= tr_chr($map + 1); + } + elsif ($upper != $infinity) { + $from .= "-" . tr_chr($this_from + $range); + $to .= "-" . tr_chr($map + $range); + } + else { + $from .= "-INFTY"; + $to .= "-INFTY"; + } } - if (defined $final and $to[$#to][1] != $final) { - push @to, [$final, $final]; + + return ($from, $to); +} + +sub tr_decode_utf8 { + my($tr_av, $flags) = @_; + printf STDERR "flags=0x%x\n", $flags if DEBUG; + my $invlist = $tr_av->ARRAYelt(0); + my @invlist = unpack("J*", $invlist->PV); + my @map = unpack("J*", $tr_av->ARRAYelt(1)->PV); + + if (DEBUG) { + for my $i (0 .. @invlist - 1) { + printf STDERR "[%d]\t%x\t", $i, $invlist[$i]; + my $map = $map[$i]; + if ($map == ~0) { + print STDERR "TR_UNMAPPED\n"; + } + elsif ($map == ~0 - 1) { + print STDERR "TR_SPECIAL\n"; + } + else { + printf STDERR "%x\n", $map; + } + } } - push @from, @delfrom; + + my ($from, $to) = tr_invmap(\@invlist, \@map); + if ($flags & OPpTRANS_COMPLEMENT) { - my @newfrom; - my $next = 0; - for my $i (0 .. $#from) { - push @newfrom, [$next, $from[$i][0] - 1]; - $next = $from[$i][1] + 1; - } - @from = (); - for my $range (@newfrom) { - if ($range->[0] <= $range->[1]) { - push @from, $range; - } - } - } - my($from, $to, $diff); - for my $chunk (@from) { - $diff = $chunk->[1] - $chunk->[0]; - if ($diff > 1) { - $from .= tr_chr($chunk->[0]) . "-" . tr_chr($chunk->[1]); - } elsif ($diff == 1) { - $from .= tr_chr($chunk->[0]) . tr_chr($chunk->[1]); - } else { - $from .= tr_chr($chunk->[0]); - } + shift @map; + pop @invlist; + my $throw_away; + ($from, $throw_away) = tr_invmap(\@invlist, \@map); } - for my $chunk (@to) { - $diff = $chunk->[1] - $chunk->[0]; - if ($diff > 1) { - $to .= tr_chr($chunk->[0]) . "-" . tr_chr($chunk->[1]); - } elsif ($diff == 1) { - $to .= tr_chr($chunk->[0]) . tr_chr($chunk->[1]); - } else { - $to .= tr_chr($chunk->[0]); - } + + if (DEBUG) { + print STDERR "Returning ", escape_str($from), "/", + escape_str($to), "\n"; } - #$final = sprintf("%04x", $final) if defined $final; - #$none = sprintf("%04x", $none) if defined $none; - #$extra = sprintf("%04x", $extra) if defined $extra; - #print STDERR "final: $final\n none: $none\nextra: $extra\n"; - #print STDERR $swash{'LIST'}->PV; return (escape_str($from), escape_str($to)); } @@ -5821,9 +5804,9 @@ sub pp_trans { ($from, $to) = tr_decode_byte($op->pv, $priv_flags); } elsif ($class eq "PADOP") { ($from, $to) - = tr_decode_utf8($self->padval($op->padix)->RV, $priv_flags); + = tr_decode_utf8($self->padval($op->padix), $priv_flags); } else { # class($op) eq "SVOP" - ($from, $to) = tr_decode_utf8($op->sv->RV, $priv_flags); + ($from, $to) = tr_decode_utf8($op->sv, $priv_flags); } my $flags = ""; $flags .= "c" if $priv_flags & OPpTRANS_COMPLEMENT; @@ -1059,7 +1059,7 @@ Perl_op_clear(pTHX_ OP *o) case OP_TRANS: case OP_TRANSR: if ( (o->op_type == OP_TRANS || o->op_type == OP_TRANSR) - && (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))) + && (o->op_private & OPpTRANS_USE_SVOP)) { #ifdef USE_ITHREADS if (cPADOPo->op_padix > 0) { @@ -6784,8 +6784,8 @@ static int uvcompare(const void *a, const void *b) * OPpTRANS_SQUASH * OPpTRANS_DELETE * flags as appropriate; this function may add - * OPpTRANS_FROM_UTF - * OPpTRANS_TO_UTF + * OPpTRANS_USE_SVOP + * OPpTRANS_CAN_FORCE_UTF8 * OPpTRANS_IDENTICAL * OPpTRANS_GROWS * flags @@ -6794,416 +6794,1028 @@ static int uvcompare(const void *a, const void *b) static OP * S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) { + /* This function compiles a tr///, from data gathered from toke.c, into a + * form suitable for use by do_trans() in doop.c at runtime. + * + * It first normalizes the data, while discarding extraneous inputs; then + * writes out the compiled data. The normalization allows for complete + * analysis, and avoids some false negatives and positives earlier versions + * of this code had. + * + * The normalization form is an inversion map (described below in detail). + * This is essentially the compiled form for tr///'s that require UTF-8, + * and its easy to use it to write the 257-byte table for tr///'s that + * don't need UTF-8. That table is identical to what's been in use for + * many perl versions, except that it doesn't handle some edge cases that + * it used to, involving code points above 255. The UTF-8 form now handles + * these. (This could be changed with extra coding should it shown to be + * desirable.) + * + * If the complement (/c) option is specified, the lhs string (tstr) is + * parsed into an inversion list. Complementing these is trivial. Then a + * complemented tstr is built from that, and used thenceforth. This hides + * the fact that it was complemented from almost all successive code. + * + * One of the important characteristics to know about the input is whether + * the transliteration may be done in place, or does a temporary need to be + * allocated, then copied. If the replacement for every character in every + * possible string takes up no more bytes than the the character it + * replaces, then it can be edited in place. Otherwise the replacement + * could "grow", depending on the strings being processed. Some inputs + * won't grow, and might even shrink under /d, but some inputs could grow, + * so we have to assume any given one might grow. On very long inputs, the + * temporary could eat up a lot of memory, so we want to avoid it if + * possible. For non-UTF-8 inputs, everything is single-byte, so can be + * edited in place, unless there is something in the pattern that could + * force it into UTF-8. The inversion map makes it feasible to determine + * this. Previous versions of this code pretty much punted on determining + * if UTF-8 could be edited in place. Now, this code is rigorous in making + * that determination. + * + * Another characteristic we need to know is whether the lhs and rhs are + * identical. If so, and no other flags are present, the only effect of + * the tr/// is to count the characters present in the input that are + * mentioned in the lhs string. The implementation of that is easier and + * runs faster than the more general case. Normalizing here allows for + * accurate determination of this. Previously there were false negatives + * possible. + * + * Instead of 'transliterated', the comments here use 'unmapped' for the + * characters that are left unchanged by the operation; otherwise they are + * 'mapped' + * + * The lhs of the tr/// is here referred to as the t side. + * The rhs of the tr/// is here referred to as the r side. + */ + SV * const tstr = ((SVOP*)expr)->op_sv; SV * const rstr = ((SVOP*)repl)->op_sv; STRLEN tlen; STRLEN rlen; - const U8 *t = (U8*)SvPV_const(tstr, tlen); - const U8 *r = (U8*)SvPV_const(rstr, rlen); - Size_t i, j; - bool grows = FALSE; - OPtrans_map *tbl; - SSize_t struct_size; /* malloced size of table struct */ - + const U8 * t0 = (U8*)SvPV_const(tstr, tlen); + const U8 * r0 = (U8*)SvPV_const(rstr, rlen); + const U8 * t = t0; + const U8 * r = r0; + Size_t t_count = 0, r_count = 0; /* Number of characters in search and + replacement lists */ + + /* khw thinks some of the private flags for this op are quaintly named. + * OPpTRANS_GROWS for example is TRUE if the replacement for some lhs + * character when represented in UTF-8 is longer than the original + * character's UTF-8 representation */ const bool complement = cBOOL(o->op_private & OPpTRANS_COMPLEMENT); const bool squash = cBOOL(o->op_private & OPpTRANS_SQUASH); const bool del = cBOOL(o->op_private & OPpTRANS_DELETE); - SV* swash; + + /* Set to true if there is some character < 256 in the lhs that maps to > + * 255. If so, a non-UTF-8 match string can be forced into requiring to be + * in UTF-8 by a tr/// operation. */ + bool can_force_utf8 = FALSE; + + /* What is the maximum expansion factor in UTF-8 transliterations. If a + * 2-byte UTF-8 encoded character is to be replaced by a 3-byte one, its + * expansion factor is 1.5. This number is used at runtime to calculate + * how much space to allocate for non-inplace transliterations. Without + * this number, the worst case is 14, which is extremely unlikely to happen + * in real life, and would require significant memory overhead. */ + NV max_expansion = 1.; + + SSize_t t_range_count, r_range_count, min_range_count; + UV* t_array; + SV* t_invlist; + UV* r_map; + UV r_cp, t_cp; + IV t_cp_end = -1; + UV r_cp_end; + Size_t len; + AV* invmap; + UV final_map = TR_UNLISTED; /* The final character in the replacement + list, updated as we go along. Initialize + to something illegal */ + + bool rstr_utf8 = cBOOL(SvUTF8(rstr)); + bool tstr_utf8 = cBOOL(SvUTF8(tstr)); + + const U8* tend = t + tlen; + const U8* rend = r + rlen; + + SV * inverted_tstr = NULL; + + Size_t i; + unsigned int pass2; + + /* This routine implements detection of a transliteration having a longer + * UTF-8 representation than its source, by partitioning all the possible + * code points of the platform into equivalence classes of the same UTF-8 + * byte length in the first pass. As it constructs the mappings, it carves + * these up into smaller chunks, but doesn't merge any together. This + * makes it easy to find the instances it's looking for. A second pass is + * done after this has been determined which merges things together to + * shrink the table for runtime. For ASCII platforms, the table is + * trivial, given below, and uses the fundamental characteristics of UTF-8 + * to construct the values. For EBCDIC, it isn't so, and we rely on a + * table constructed by the perl script that generates these kinds of + * things */ +#ifndef EBCDIC + UV PL_partition_by_byte_length[] = { + 0, + 0x80, + (32 * (1UL << ( UTF_ACCUMULATION_SHIFT))), + (32 * (1UL << ( UTF_ACCUMULATION_SHIFT))), + (16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))), + ( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))), + ( 4 * (1UL << (4 * UTF_ACCUMULATION_SHIFT))), + ( 2 * (1UL << (5 * UTF_ACCUMULATION_SHIFT))) + +# ifdef UV_IS_QUAD + , + ( (1UL << (6 * UTF_ACCUMULATION_SHIFT))) + +# endif +#endif + }; PERL_ARGS_ASSERT_PMTRANS; PL_hints |= HINT_BLOCK_SCOPE; - if (SvUTF8(tstr)) - o->op_private |= OPpTRANS_FROM_UTF; + /* If /c, the search list is sorted and complemented. This is now done by + * creating an inversion list from it, and then trivially inverting that. + * The previous implementation used qsort, but creating the list + * automatically keeps it sorted as we go along */ + if (complement) { + UV start, end; + SV * inverted_tlist = _new_invlist(tlen); + Size_t temp_len; + + while (t < tend) { + + /* Non-utf8 strings don't have ranges, so each character is listed + * out */ + if (! tstr_utf8) { + inverted_tlist = add_cp_to_invlist(inverted_tlist, *t); + t++; + } + else { /* But UTF-8 strings have been parsed in toke.c to have + * ranges if appropriate. */ + UV t_cp; + Size_t t_char_len; + + /* Get the first character */ + t_cp = valid_utf8_to_uvchr(t, &t_char_len); + t += t_char_len; + + /* If the next byte indicates that this wasn't the first + * element of a range, the range is just this one */ + if (t >= tend || *t != RANGE_INDICATOR) { + inverted_tlist = add_cp_to_invlist(inverted_tlist, t_cp); + } + else { /* Otherwise, ignore the indicator byte, and get the + final element, and add the whole range */ + t++; + t_cp_end = valid_utf8_to_uvchr(t, &t_char_len); + t += t_char_len; + + inverted_tlist = _add_range_to_invlist(inverted_tlist, + t_cp, t_cp_end); + } + } + } /* End of parse through tstr */ + + /* The inversion list is done; now invert it */ + _invlist_invert(inverted_tlist); + + /* Now go through the inverted list and create a new tstr for the rest + * of the routine to use. Since the UTF-8 version can have ranges, and + * can be much more compact than the non-UTF-8 version, we create the + * string in UTF-8 even if not necessary. (This is just an intermediate + * value that gets thrown away anyway.) */ + invlist_iterinit(inverted_tlist); + inverted_tstr = newSVpvs(""); + while (invlist_iternext(inverted_tlist, &start, &end)) { + U8 temp[UTF8_MAXBYTES]; + U8 * temp_end_pos; + + /* IV_MAX keeps things from going out of bounds */ + start = MIN(IV_MAX, start); + end = MIN(IV_MAX, end); + + temp_end_pos = uvchr_to_utf8(temp, start); + sv_catpvn(inverted_tstr, (char *) temp, temp_end_pos - temp); + + if (start != end) { + Perl_sv_catpvf(aTHX_ inverted_tstr, "%c", RANGE_INDICATOR); + temp_end_pos = uvchr_to_utf8(temp, end); + sv_catpvn(inverted_tstr, (char *) temp, temp_end_pos - temp); + } + } + + /* Set up so the remainder of the routine uses this complement, instead + * of the actual input */ + t0 = t = (U8*)SvPV_const(inverted_tstr, temp_len); + tend = t0 + temp_len; + tstr_utf8 = TRUE; + + SvREFCNT_dec_NN(inverted_tlist); + } + + /* For non-/d, an empty rhs means to use the lhs */ + if (rlen == 0 && ! del) { + r0 = t0; + rend = tend; + rstr_utf8 = tstr_utf8; + } + + t_invlist = _new_invlist(1); + + /* Parse the (potentially adjusted) input, creating the inversion map. + * This is done in two passes. The first pass is to determine if the + * transliteration can be done in place. The inversion map it creates + * could be used, but generally would be larger and slower to run than the + * output of the second pass, which starts with a more compact table and + * allows more ranges to be merged */ + for (pass2 = 0; pass2 < 2; pass2++) { - if (SvUTF8(rstr)) - o->op_private |= OPpTRANS_TO_UTF; + /* Initialize to a single range */ + t_invlist = _add_range_to_invlist(t_invlist, 0, UV_MAX); - if (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) { + /* In the second pass, we just have the single range */ - /* for utf8 translations, op_sv will be set to point to a swash - * containing codepoint ranges. This is done by first assembling - * a textual representation of the ranges in listsv then compiling - * it using swash_init(). For more details of the textual format, - * see L<perlunicode.pod/"User-Defined Character Properties"> . + if (pass2) { + len = 1; + t_array = invlist_array(t_invlist); + } + else { + + /* But in the first pass, the lhs is partitioned such that the + * number of UTF-8 bytes required to represent a code point in each + * partition is the same as the number for any other code point in + * that partion. We copy the pre-compiled partion. */ + len = C_ARRAY_LENGTH(PL_partition_by_byte_length); + invlist_extend(t_invlist, len); + t_array = invlist_array(t_invlist); + Copy(PL_partition_by_byte_length, t_array, len, UV); + invlist_set_len(t_invlist, + len, + *(get_invlist_offset_addr(t_invlist))); + Newx(r_map, len + 1, UV); + } + + /* And the mapping of each of the ranges is initialized. Initially, + * everything is TR_UNLISTED. */ + for (i = 0; i < len; i++) { + r_map[i] = TR_UNLISTED; + } + + t = t0; + t_count = 0; + r = r0; + r_count = 0; + t_range_count = r_range_count = 0; + + /* Now go through the search list constructing an inversion map. The + * input is not necessarily in any particular order. Making it an + * inversion map orders it, potentially simplifying, and makes it easy + * to deal with at run time. This is the only place in core that + * generates an inversion map; if others were introduced, it might be + * better to create general purpose routines to handle them. + * (Inversion maps are created in perl in other places.) + * + * An inversion map consists of two parallel arrays. One is + * essentially an inversion list: an ordered list of code points such + * that each element gives the first code point of a range of + * consecutive code points that map to the element in the other array + * that has the same index as this one (in other words, the + * corresponding element). Thus the range extends up to (but not + * including) the code point given by the next higher element. In a + * true inversion map, the corresponding element in the other array + * gives the mapping of the first code point in the range, with the + * understanding that the next higher code point in the inversion + * list's range will map to the next higher code point in the map. + * + * So if at element [i], let's say we have: + * + * t_invlist r_map + * [i] A a + * + * This means that A => a, B => b, C => c.... Let's say that the + * situation is such that: + * + * [i+1] L -1 + * + * This means the sequence that started at [i] stops at K => k. This + * illustrates that you need to look at the next element to find where + * a sequence stops. Except, the highest element in the inversion list + * begins a range that is understood to extend to the platform's + * infinity. + * + * This routine modifies traditional inversion maps to reserve two + * mappings: + * + * TR_UNLISTED (or -1) indicates that the no code point in the range + * is listed in the tr/// searchlist. At runtime, these are + * always passed through unchanged. In the inversion map, all + * points in the range are mapped to -1, instead of increasing, + * like the 'L' in the example above. + * + * We start the parse with every code point mapped to this, and as + * we parse and find ones that are listed in the search list, we + * carve out ranges as we go along that override that. + * + * TR_SPECIAL_HANDLING (or -2) indicates that every code point in the + * range needs special handling. Again, all code points in the + * range are mapped to -2, instead of increasing. + * + * Under /d this value means the code point should be deleted from + * the transliteration when encountered. + * + * Otherwise, it marks that every code point in the range is to + * map to the final character in the replacement list. This + * happens only when the replacement list is shorter than the + * search one, so there are things in the search list that have no + * correspondence in the replacement list. For example, in + * tr/a-z/A/, 'A' is the final value, and the inversion map + * generated for this would be like this: + * \0 => -1 + * a => A + * b-z => -2 + * z+1 => -1 + * 'A' appears once, then the remainder of the range maps to -2. + * The use of -2 isn't strictly necessary, as an inversion map is + * capable of representing this situation, but not nearly so + * compactly, and this is actually quite commonly encountered. + * Indeed, the original design of this code used a full inversion + * map for this. But things like + * tr/\0-\x{FFFF}/A/ + * generated huge data structures, slowly, and the execution was + * also slow. So the current scheme was implemented. + * + * So, if the next element in our example is: + * + * [i+2] Q q + * + * Then all of L, M, N, O, and P map to TR_UNLISTED. If the next + * elements are + * + * [i+3] R z + * [i+4] S TR_UNLISTED + * + * Then Q => q; R => z; and S => TR_UNLISTED. If [i+4] (the 'S') is + * the final element in the arrays, every code point from S to infinity + * maps to TR_UNLISTED. + * */ + /* Finish up range started in what otherwise would + * have been the final iteration */ + while (t < tend || t_range_count > 0) { + bool adjacent_to_range_above = FALSE; + bool adjacent_to_range_below = FALSE; + + bool merge_with_range_above = FALSE; + bool merge_with_range_below = FALSE; + + SSize_t i, span, invmap_range_length_remaining; + + /* If we are in the middle of processing a range in the 'target' + * side, the previous iteration has set us up. Otherwise, look at + * the next character in the search list */ + if (t_range_count <= 0) { + if (! tstr_utf8) { + + /* Here, not in the middle of a range, and not UTF-8. The + * next code point is the single byte where we're at */ + t_cp = *t; + t_range_count = 1; + t++; + } + else { + Size_t t_char_len; + + /* Here, not in the middle of a range, and is UTF-8. The + * next code point is the next UTF-8 char in the input. We + * know the input is valid, because the toker constructed + * it */ + t_cp = valid_utf8_to_uvchr(t, &t_char_len); + t += t_char_len; + + /* UTF-8 strings (only) have been parsed in toke.c to have + * ranges. See if the next byte indicates that this was + * the first element of a range. If so, get the final + * element and calculate the range size. If not, the range + * size is 1 */ + if (t < tend && *t == RANGE_INDICATOR) { + t++; + t_range_count = valid_utf8_to_uvchr(t, &t_char_len) + - t_cp + 1; + t += t_char_len; + } + else { + t_range_count = 1; + } + } - SV* const listsv = newSVpvs("# comment\n"); - SV* transv = NULL; - const U8* tend = t + tlen; - const U8* rend = r + rlen; - STRLEN ulen; - UV tfirst = 1; - UV tlast = 0; - IV tdiff; - STRLEN tcount = 0; - UV rfirst = 1; - UV rlast = 0; - IV rdiff; - STRLEN rcount = 0; - IV diff; - I32 none = 0; - U32 max = 0; - I32 bits; - I32 havefinal = 0; - U32 final = 0; - const I32 from_utf = o->op_private & OPpTRANS_FROM_UTF; - const I32 to_utf = o->op_private & OPpTRANS_TO_UTF; - U8* tsave = NULL; - U8* rsave = NULL; - const U32 flags = UTF8_ALLOW_DEFAULT; - - if (!from_utf) { - STRLEN len = tlen; - t = tsave = bytes_to_utf8(t, &len); - tend = t + len; - } - if (!to_utf && rlen) { - STRLEN len = rlen; - r = rsave = bytes_to_utf8(r, &len); - rend = r + len; - } + /* Count the total number of listed code points * */ + t_count += t_range_count; + } -/* There is a snag with this code on EBCDIC: scan_const() in toke.c has - * encoded chars in native encoding which makes ranges in the EBCDIC 0..255 - * odd. */ + /* Similarly, get the next character in the replacement list */ + if (r_range_count <= 0) { + if (r >= rend) { - if (complement) { - /* utf8 and /c: - * replace t/tlen/tend with a version that has the ranges - * complemented - */ - U8 tmpbuf[UTF8_MAXBYTES+1]; - UV *cp; - UV nextmin = 0; - Newx(cp, 2*tlen, UV); - i = 0; - transv = newSVpvs(""); - - /* convert search string into array of (start,end) range - * codepoint pairs stored in cp[]. Most "ranges" will start - * and end at the same char */ - while (t < tend) { - cp[2*i] = utf8n_to_uvchr(t, tend-t, &ulen, flags); - t += ulen; - /* the toker converts X-Y into (X, RANGE_INDICATOR, Y) */ - if (t < tend && *t == RANGE_INDICATOR) { - t++; - cp[2*i+1] = utf8n_to_uvchr(t, tend-t, &ulen, flags); - t += ulen; - } - else { - cp[2*i+1] = cp[2*i]; - } - i++; - } + /* But if we've exhausted the rhs, there is nothing to map + * to, except the special handling one, and we make the + * range the same size as the lhs one. */ + r_cp = TR_SPECIAL_HANDLING; + r_range_count = t_range_count; + } + else { + if (! rstr_utf8) { + r_cp = *r; + r_range_count = 1; + r++; + } + else { + Size_t r_char_len; + + r_cp = valid_utf8_to_uvchr(r, &r_char_len); + r += r_char_len; + if (r < rend && *r == RANGE_INDICATOR) { + r++; + r_range_count = valid_utf8_to_uvchr(r, + &r_char_len) - r_cp + 1; + r += r_char_len; + } + else { + r_range_count = 1; + } + } - /* sort the ranges */ - qsort(cp, i, 2*sizeof(UV), uvcompare); - - /* Create a utf8 string containing the complement of the - * codepoint ranges. For example if cp[] contains [A,B], [C,D], - * then transv will contain the equivalent of: - * join '', map chr, 0, RANGE_INDICATOR, A - 1, - * B + 1, RANGE_INDICATOR, C - 1, - * D + 1, RANGE_INDICATOR, 0x7fffffff; - * A range of a single char skips the RANGE_INDICATOR and - * end cp. - */ - for (j = 0; j < i; j++) { - UV val = cp[2*j]; - diff = val - nextmin; - if (diff > 0) { - t = uvchr_to_utf8(tmpbuf,nextmin); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - if (diff > 1) { - U8 range_mark = RANGE_INDICATOR; - t = uvchr_to_utf8(tmpbuf, val - 1); - sv_catpvn(transv, (char *)&range_mark, 1); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - } - } - val = cp[2*j+1]; - if (val >= nextmin) - nextmin = val + 1; - } + if (r_cp == TR_SPECIAL_HANDLING) { + r_range_count = t_range_count; + } - t = uvchr_to_utf8(tmpbuf,nextmin); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - { - U8 range_mark = RANGE_INDICATOR; - sv_catpvn(transv, (char *)&range_mark, 1); - } - t = uvchr_to_utf8(tmpbuf, 0x7fffffff); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - t = (const U8*)SvPVX_const(transv); - tlen = SvCUR(transv); - tend = t + tlen; - Safefree(cp); - } - else if (!rlen && !del) { - r = t; rlen = tlen; rend = tend; - } + /* This is the final character so far */ + final_map = r_cp + r_range_count - 1; - if (!squash) { - if ((!rlen && !del) || t == r || - (tlen == rlen && memEQ((char *)t, (char *)r, tlen))) - { - o->op_private |= OPpTRANS_IDENTICAL; - } - } + r_count += r_range_count; + } + } - /* extract char ranges from t and r and append them to listsv */ - - while (t < tend || tfirst <= tlast) { - /* see if we need more "t" chars */ - if (tfirst > tlast) { - tfirst = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags); - t += ulen; - if (t < tend && *t == RANGE_INDICATOR) { /* illegal utf8 val indicates range */ - t++; - tlast = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags); - t += ulen; - } - else - tlast = tfirst; - } + /* Here, we have the next things ready in both sides. They are + * potentially ranges. We try to process as big a chunk as + * possible at once, but the lhs and rhs must be synchronized, so + * things like tr/A-Z/a-ij-z/ will need to be processed in 2 chunks + * */ + min_range_count = MIN(t_range_count, r_range_count); + + /* Search the inversion list for the entry that contains the input + * code point <cp>. The inversion map was initialized to cover the + * entire range of possible inputs, so this should not fail. So + * the return value is the index into the list's array of the range + * that contains <cp>, that is, 'i' such that array[i] <= cp < + * array[i+1] */ + i = _invlist_search(t_invlist, t_cp); + assert(i >= 0); + + /* Here, the data structure might look like: + * + * index t r Meaning + * [i-1] J j # J-L => j-l + * [i] M -1 # M => default; as do N, O, P, Q + * [i+1] R x # R => x, S => x+1, T => x+2 + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + * + * where 'x' and 'y' above are not to be taken literally. + * + * The maximum chunk we can handle in this loop iteration, is the + * smallest of the three components: the lhs 't_', the rhs 'r_', + * and the remainder of the range in element [i]. (In pass 1, that + * range will have everything in it be of the same class; we can't + * cross into another class.) 'min_range_count' already contains + * the smallest of the first two values. The final one is + * irrelevant if the map is to the special indicator */ + + invmap_range_length_remaining = ((Size_t) i + 1 < len) + ? t_array[i+1] - t_cp + : IV_MAX - t_cp; + span = MAX(1, MIN(min_range_count, invmap_range_length_remaining)); + + /* The end point of this chunk is where we are, plus the span, but + * never larger than the platform's infinity */ + t_cp_end = MIN(IV_MAX, t_cp + span - 1); + + if (r_cp == TR_SPECIAL_HANDLING) { + r_cp_end = TR_SPECIAL_HANDLING; + } + else { + r_cp_end = MIN(IV_MAX, r_cp + span - 1); + + /* If something on the lhs is below 256, and something on the + * rhs is above, there is a potential mapping here across that + * boundary. Indeed the only way there isn't is if both sides + * start at the same point. That means they both cross at the + * same time. But otherwise one crosses before the other */ + if (t_cp < 256 && r_cp_end > 255 && r_cp != t_cp) { + can_force_utf8 = TRUE; + } + } - /* now see if we need more "r" chars */ - if (rfirst > rlast) { - if (r < rend) { - rfirst = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags); - r += ulen; - if (r < rend && *r == RANGE_INDICATOR) { /* illegal utf8 val indicates range */ - r++; - rlast = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags); - r += ulen; - } - else - rlast = rfirst; - } - else { - if (!havefinal++) - final = rlast; - rfirst = rlast = 0xffffffff; - } - } + /* If a character appears in the search list more than once, the + * 2nd and succeeding occurrences are ignored, so only do this + * range if haven't already processed this character. (The range + * has been set up so that all members in it will be of the same + * ilk) */ + if (r_map[i] == TR_UNLISTED) { + + /* This is the first definition for this chunk, hence is valid + * and needs to be processed. Here and in the comments below, + * we use the above sample data. The t_cp chunk must be any + * contiguous subset of M, N, O, P, and/or Q. + * + * In the first pass, the t_invlist has been partitioned so + * that all elements in any single range have the same number + * of bytes in their UTF-8 representations. And the r space is + * either a single byte, or a range of strictly monotonically + * increasing code points. So the final element in the range + * will be represented by no fewer bytes than the initial one. + * That means that if the final code point in the t range has + * at least as many bytes as the final code point in the r, + * then all code points in the t range have at least as many + * bytes as their corresponding r range element. But if that's + * not true, the transliteration of at least the final code + * point grows in length. As an example, suppose we had + * tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/ + * The UTF-8 for all but 10000 occupies 3 bytes on ASCII + * platforms. We have deliberately set up the data structure + * so that any range in the lhs gets split into chunks for + * processing, such that every code point in a chunk has the + * same number of UTF-8 bytes. We only have to check the final + * code point in the rhs against any code point in the lhs. */ + if ( ! pass2 + && r_cp_end != TR_SPECIAL_HANDLING + && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end)) + { + NV ratio = UVCHR_SKIP(r_cp_end) / UVCHR_SKIP(t_cp); - /* now see which range will peter out first, if either. */ - tdiff = tlast - tfirst; - rdiff = rlast - rfirst; - tcount += tdiff + 1; - rcount += rdiff + 1; + o->op_private |= OPpTRANS_GROWS; - if (tdiff <= rdiff) - diff = tdiff; - else - diff = rdiff; + /* Now that we know it grows, we can keep track of the + * largest ratio */ + if (ratio > max_expansion) { + max_expansion = ratio; + } + } - if (rfirst == 0xffffffff) { - diff = tdiff; /* oops, pretend rdiff is infinite */ - if (diff > 0) - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t%04lx\tXXXX\n", - (long)tfirst, (long)tlast); - else - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t\tXXXX\n", (long)tfirst); - } - else { - if (diff > 0) - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t%04lx\t%04lx\n", - (long)tfirst, (long)(tfirst + diff), - (long)rfirst); - else - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t\t%04lx\n", - (long)tfirst, (long)rfirst); - - if (rfirst + diff > max) - max = rfirst + diff; - if (!grows) - grows = (tfirst < rfirst && - UVCHR_SKIP(tfirst) < UVCHR_SKIP(rfirst + diff)); - rfirst += diff + 1; - } - tfirst += diff + 1; - } + /* The very first range is marked as adjacent to the + * non-existent range below it, as it causes things to "just + * work" (TradeMark) + * + * If the lowest code point in this chunk is M, it adjoins the + * J-L range */ + if (t_cp == t_array[i]) { + adjacent_to_range_below = TRUE; + + /* And if the map has the same offset from the beginning of + * the range as does this new code point (or both are for + * TR_SPECIAL_HANDLING), this chunk can be completely + * merged with the range below. EXCEPT, in the first pass, + * we don't merge ranges whose UTF-8 byte representations + * have different lengths, so that we can more easily + * detect if a replacement is longer than the source, that + * is if it 'grows'. But in the 2nd pass, there's no + * reason to not merge */ + if ( (i > 0 && ( pass2 + || UVCHR_SKIP(t_array[i-1]) + == UVCHR_SKIP(t_cp))) + && ( ( r_cp == TR_SPECIAL_HANDLING + && r_map[i-1] == TR_SPECIAL_HANDLING) + || ( r_cp != TR_SPECIAL_HANDLING + && r_cp - r_map[i-1] == t_cp - t_array[i-1]))) + { + merge_with_range_below = TRUE; + } + } - /* compile listsv into a swash and attach to o */ + /* Similarly, if the highest code point in this chunk is 'Q', + * it adjoins the range above, and if the map is suitable, can + * be merged with it */ + if ( t_cp_end >= IV_MAX - 1 + || ( (Size_t) i + 1 < len + && (Size_t) t_cp_end + 1 == t_array[i+1])) + { + adjacent_to_range_above = TRUE; + if ((Size_t) i + 1 < len) + if ( ( pass2 + || UVCHR_SKIP(t_cp) == UVCHR_SKIP(t_array[i+1])) + && ( ( r_cp == TR_SPECIAL_HANDLING + && r_map[i+1] == (UV) TR_SPECIAL_HANDLING) + || ( r_cp != TR_SPECIAL_HANDLING + && r_cp_end == r_map[i+1] - 1))) + { + merge_with_range_above = TRUE; + } + } - none = ++max; - if (del) - ++max; + if (merge_with_range_below && merge_with_range_above) { + + /* Here the new chunk looks like M => m, ... Q => q; and + * the range above is like R => r, .... Thus, the [i-1] + * and [i+1] ranges should be seamlessly melded so the + * result looks like + * + * [i-1] J j # J-T => j-t + * [i] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ + Move(t_array + i + 2, t_array + i, len - i - 2, UV); + Move(r_map + i + 2, r_map + i, len - i - 2, UV); + len -= 2; + invlist_set_len(t_invlist, + len, + *(get_invlist_offset_addr(t_invlist))); + } + else if (merge_with_range_below) { + + /* Here the new chunk looks like M => m, .... But either + * (or both) it doesn't extend all the way up through Q; or + * the range above doesn't start with R => r. */ + if (! adjacent_to_range_above) { + + /* In the first case, let's say the new chunk extends + * through O. We then want: + * + * [i-1] J j # J-O => j-o + * [i] P -1 # P => -1, Q => -1 + * [i+1] R x # R => x, S => x+1, T => x+2 + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ + t_array[i] = t_cp_end + 1; + r_map[i] = TR_UNLISTED; + } + else { /* Adjoins the range above, but can't merge with it + (because 'x' is not the next map after q) */ + /* + * [i-1] J j # J-Q => j-q + * [i] R x # R => x, S => x+1, T => x+2 + * [i+1] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ - if (max > 0xffff) - bits = 32; - else if (max > 0xff) - bits = 16; - else - bits = 8; + Move(t_array + i + 1, t_array + i, len - i - 1, UV); + Move(r_map + i + 1, r_map + i, len - i - 1, UV); + len--; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + } + else if (merge_with_range_above) { + + /* Here the new chunk ends with Q => q, and the range above + * must start with R => r, so the two can be merged. But + * either (or both) the new chunk doesn't extend all the + * way down to M; or the mapping of the final code point + * range below isn't m */ + if (! adjacent_to_range_below) { + + /* In the first case, let's assume the new chunk starts + * with P => p. Then, because it's merge-able with the + * range above, that range must be R => r. We want: + * + * [i-1] J j # J-L => j-l + * [i] M -1 # M => -1, N => -1 + * [i+1] P p # P-T => p-t + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ + t_array[i+1] = t_cp; + r_map[i+1] = r_cp; + } + else { /* Adjoins the range below, but can't merge with it + */ + /* + * [i-1] J j # J-L => j-l + * [i] M x # M-T => x-5 .. x+2 + * [i+1] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ + Move(t_array + i + 1, t_array + i, len - i - 1, UV); + Move(r_map + i + 1, r_map + i, len - i - 1, UV); + len--; + t_array[i] = t_cp; + r_map[i] = r_cp; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + } + else if (adjacent_to_range_below && adjacent_to_range_above) { + /* The new chunk completely fills the gap between the + * ranges on either side, but can't merge with either of + * them. + * + * [i-1] J j # J-L => j-l + * [i] M z # M => z, N => z+1 ... Q => z+4 + * [i+1] R x # R => x, S => x+1, T => x+2 + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ + r_map[i] = r_cp; + } + else if (adjacent_to_range_below) { + /* The new chunk adjoins the range below, but not the range + * above, and can't merge. Let's assume the chunk ends at + * O. + * + * [i-1] J j # J-L => j-l + * [i] M z # M => z, N => z+1, O => z+2 + * [i+1] P -1 # P => -1, Q => -1 + * [i+2] R x # R => x, S => x+1, T => x+2 + * [i+3] U y # U => y, V => y+1, ... + * ... + * [-w] Z -1 # Z => default; as do Z+1, ... infinity + */ + invlist_extend(t_invlist, len + 1); + t_array = invlist_array(t_invlist); + Renew(r_map, len + 1, UV); + + Move(t_array + i + 1, t_array + i + 2, len - i - 1, UV); + Move(r_map + i + 1, r_map + i + 2, len - i - 1, UV); + r_map[i] = r_cp; + t_array[i+1] = t_cp_end + 1; + r_map[i+1] = TR_UNLISTED; + len++; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + else if (adjacent_to_range_above) { + /* The new chunk adjoins the range above, but not the range + * below, and can't merge. Let's assume the new chunk + * starts at O + * + * [i-1] J j # J-L => j-l + * [i] M -1 # M => default, N => default + * [i+1] O z # O => z, P => z+1, Q => z+2 + * [i+2] R x # R => x, S => x+1, T => x+2 + * [i+3] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ + invlist_extend(t_invlist, len + 1); + t_array = invlist_array(t_invlist); + Renew(r_map, len + 1, UV); + + Move(t_array + i + 1, t_array + i + 2, len - i - 1, UV); + Move(r_map + i + 1, r_map + i + 2, len - i - 1, UV); + t_array[i+1] = t_cp; + r_map[i+1] = r_cp; + len++; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + else { + /* The new chunk adjoins neither the range above, nor the + * range below. Lets assume it is N..P => n..p + * + * [i-1] J j # J-L => j-l + * [i] M -1 # M => default + * [i+1] N n # N..P => n..p + * [i+2] Q -1 # Q => default + * [i+3] R x # R => x, S => x+1, T => x+2 + * [i+4] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ - swash = MUTABLE_SV(swash_init("utf8", "", listsv, bits, none)); -#ifdef USE_ITHREADS - cPADOPo->op_padix = pad_alloc(OP_TRANS, SVf_READONLY); - SvREFCNT_dec(PAD_SVl(cPADOPo->op_padix)); - PAD_SETSV(cPADOPo->op_padix, swash); - SvPADTMP_on(swash); - SvREADONLY_on(swash); -#else - cSVOPo->op_sv = swash; -#endif - SvREFCNT_dec(listsv); - SvREFCNT_dec(transv); + invlist_extend(t_invlist, len + 2); + t_array = invlist_array(t_invlist); + Renew(r_map, len + 2, UV); + + Move(t_array + i + 1, + t_array + i + 2 + 1, len - i - (2 - 1), UV); + Move(r_map + i + 1, + r_map + i + 2 + 1, len - i - (2 - 1), UV); + + len += 2; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + + t_array[i+1] = t_cp; + r_map[i+1] = r_cp; + + t_array[i+2] = t_cp_end + 1; + r_map[i+2] = TR_UNLISTED; + } + } /* End of this chunk needs to be processed */ + + /* Done with this chunk. */ + t_cp += span; + if (t_cp >= IV_MAX) { + break; + } + t_range_count -= span; + if (r_cp != TR_SPECIAL_HANDLING) { + r_cp += span; + r_range_count -= span; + } + else { + r_range_count = 0; + } + + } /* End of loop through the search list */ + + /* We don't need an exact count, but we do need to know if there is + * anything left over in the replacement list. So, just assume it's + * one byte per character */ + if (rend > r) { + r_count++; + } + } /* End of passes */ + + SvREFCNT_dec(inverted_tstr); - if (!del && havefinal && rlen) - (void)hv_store(MUTABLE_HV(SvRV(swash)), "FINAL", 5, - newSVuv((UV)final), 0); + /* We now have normalized the input into an inversion map. + * + * See if the lhs and rhs are equivalent. If so, this tr/// is a no-op + * except for the count, and streamlined runtime code can be used */ + if (!del && !squash) { + + /* They are identical if they point to same address, or if everything + * maps to UNLISTED or to itself. This catches things that not looking + * at the normalized inversion map doesn't catch, like tr/aa/ab/ or + * tr/\x{100}-\x{104}/\x{100}-\x{102}\x{103}-\x{104} */ + if (r0 != t0) { + for (i = 0; i < len; i++) { + if (r_map[i] != TR_UNLISTED && r_map[i] != t_array[i]) { + goto done_identical_check; + } + } + } + + /* Here have gone through entire list, and didn't find any + * non-identical mappings */ + o->op_private |= OPpTRANS_IDENTICAL; + + done_identical_check: ; + } + + t_array = invlist_array(t_invlist); + + /* If has components above 255, we generally need to use the inversion map + * implementation */ + if ( can_force_utf8 + || ( len > 0 + && t_array[len-1] > 255 + /* If the final range is 0x100-INFINITY and is a special + * mapping, the table implementation can handle it */ + && ! ( t_array[len-1] == 256 + && ( r_map[len-1] == TR_UNLISTED + || r_map[len-1] == TR_SPECIAL_HANDLING)))) + { + SV* r_map_sv; + + /* A UTF-8 op is generated, indicated by this flag. This op is an + * sv_op */ + o->op_private |= OPpTRANS_USE_SVOP; + + if (can_force_utf8) { + o->op_private |= OPpTRANS_CAN_FORCE_UTF8; + } + + /* The inversion map is pushed; first the list. */ + invmap = MUTABLE_AV(newAV()); + av_push(invmap, t_invlist); - Safefree(tsave); - Safefree(rsave); + /* 2nd is the mapping */ + r_map_sv = newSVpvn((char *) r_map, len * sizeof(UV)); + av_push(invmap, r_map_sv); - tlen = tcount; - rlen = rcount; - if (r < rend) - rlen++; - else if (rlast == 0xffffffff) - rlen = 0; + /* 3rd is the max possible expansion factor */ + av_push(invmap, newSVnv(max_expansion)); + + /* Characters that are in the search list, but not in the replacement + * list are mapped to the final character in the replacement list */ + if (! del && r_count < t_count) { + av_push(invmap, newSVuv(final_map)); + } + +#ifdef USE_ITHREADS + cPADOPo->op_padix = pad_alloc(OP_TRANS, SVf_READONLY); + SvREFCNT_dec(PAD_SVl(cPADOPo->op_padix)); + PAD_SETSV(cPADOPo->op_padix, (SV *) invmap); + SvPADTMP_on(invmap); + SvREADONLY_on(invmap); +#else + cSVOPo->op_sv = (SV *) invmap; +#endif - goto warnins; } + else { + OPtrans_map *tbl; + Size_t i; + + /* The OPtrans_map struct already contains one slot; hence the -1. */ + SSize_t struct_size = sizeof(OPtrans_map) + + (256 - 1 + 1)*sizeof(short); /* Non-utf8 case: set o->op_pv to point to a simple 256+ entry lookup * table. Entries with the value TR_UNMAPPED indicate chars not to be * translated, while TR_DELETE indicates a search char without a * corresponding replacement char under /d. * - * Normally, the table has 256 slots. However, in the presence of - * /c, the search charlist has an implicit \x{100}-\x{7fffffff} - * added, and if there are enough replacement chars to start pairing - * with the \x{100},... search chars, then a larger (> 256) table - * is allocated. - * - * In addition, regardless of whether under /c, an extra slot at the - * end is used to store the final repeating char, or TR_R_EMPTY under an - * empty replacement list, or TR_DELETE under /d; which makes the - * runtime code easier. - * - * The toker will have already expanded char ranges in t and r. + * In addition, an extra slot at the end is used to store the final + * repeating char, or TR_R_EMPTY under an empty replacement list, or + * TR_DELETE under /d; which makes the runtime code easier. */ - /* Initially allocate 257-slot table: 256 for basic (non /c) usage, - * plus final slot for repeat/TR_DELETE/TR_R_EMPTY. Later we realloc if - * excess > * 0. The OPtrans_map struct already contains one slot; - * hence the -1. - */ - struct_size = sizeof(OPtrans_map) + (256 - 1 + 1)*sizeof(short); + /* Indicate this is an op_pv */ + o->op_private &= ~OPpTRANS_USE_SVOP; + tbl = (OPtrans_map*)PerlMemShared_calloc(struct_size, 1); tbl->size = 256; cPVOPo->op_pv = (char*)tbl; - if (complement) { - Size_t excess; + for (i = 0; i < len; i++) { + STATIC_ASSERT_DECL(TR_SPECIAL_HANDLING == TR_DELETE); + short upper = i >= len - 1 ? 256 : t_array[i+1]; + short to = r_map[i]; + short j; + bool do_increment = TRUE; - /* in this branch, j is a count of 'consumed' (i.e. paired off - * with a search char) replacement chars (so j <= rlen always) - */ - for (i = 0; i < tlen; i++) - tbl->map[t[i]] = (short) TR_UNMAPPED; - - for (i = 0, j = 0; i < 256; i++) { - if (!tbl->map[i]) { - if (j == rlen) { - if (del) - tbl->map[i] = (short) TR_DELETE; - else if (rlen) - tbl->map[i] = r[j-1]; - else - tbl->map[i] = (short)i; - } - else { - tbl->map[i] = r[j++]; - } - if ( tbl->map[i] >= 0 - && UVCHR_IS_INVARIANT((UV)i) - && !UVCHR_IS_INVARIANT((UV)(tbl->map[i])) - ) - grows = TRUE; - } + /* Any code points above our limit should be irrelevant */ + if (t_array[i] >= tbl->size) break; + + /* Set up the map */ + if (to == (short) TR_SPECIAL_HANDLING && ! del) { + to = final_map; + do_increment = FALSE; + } + else if (to < 0) { + do_increment = FALSE; } - ASSUME(j <= rlen); - excess = rlen - j; + /* Create a map for everything in this range. The value increases + * except for the special cases */ + for (j = t_array[i]; j < upper; j++) { + tbl->map[j] = to; + if (do_increment) to++; + } + } - if (excess) { - /* More replacement chars than search chars: - * store excess replacement chars at end of main table. - */ + tbl->map[tbl->size] = del + ? (short) TR_DELETE + : rlen + ? final_map + : (short) TR_R_EMPTY; - struct_size += excess * sizeof(short); - tbl = (OPtrans_map*)PerlMemShared_realloc(tbl, struct_size); - tbl->size += excess; - cPVOPo->op_pv = (char*)tbl; + SvREFCNT_dec(t_invlist); - for (i = 0; i < excess; i++) - tbl->map[i + 256] = r[j+i]; - } - else { - /* no more replacement chars than search chars */ - if (!rlen && !del && !squash) - o->op_private |= OPpTRANS_IDENTICAL; - } +#if 0 /* code that added excess above-255 chars at the end of the table, in + case we ever want to not use the inversion map implementation for + this */ - tbl->map[tbl->size] = del - ? (short) TR_DELETE - : rlen - ? r[rlen - 1] - : (short) TR_R_EMPTY; + ASSUME(j <= rlen); + excess = rlen - j; + + if (excess) { + /* More replacement chars than search chars: + * store excess replacement chars at end of main table. + */ + + struct_size += excess; + tbl = (OPtrans_map*)PerlMemShared_realloc(tbl, + struct_size + excess * sizeof(short)); + tbl->size += excess; + cPVOPo->op_pv = (char*)tbl; + + for (i = 0; i < excess; i++) + tbl->map[i + 256] = r[j+i]; } else { - if (!rlen && !del) { - r = t; rlen = tlen; - if (!squash) - o->op_private |= OPpTRANS_IDENTICAL; - } - else if (!squash && rlen == tlen && memEQ((char*)t, (char*)r, tlen)) { - o->op_private |= OPpTRANS_IDENTICAL; - } + /* no more replacement chars than search chars */ +#endif - for (i = 0; i < 256; i++) - tbl->map[i] = (short) TR_UNMAPPED; - for (i = 0, j = 0; i < tlen; i++,j++) { - if (j >= rlen) { - if (del) { - if (tbl->map[t[i]] == (short) TR_UNMAPPED) - tbl->map[t[i]] = (short) TR_DELETE; - continue; - } - --j; - } - if (tbl->map[t[i]] == (short) TR_UNMAPPED) { - if ( UVCHR_IS_INVARIANT(t[i]) - && ! UVCHR_IS_INVARIANT(r[j])) - grows = TRUE; - tbl->map[t[i]] = r[j]; - } - } - tbl->map[tbl->size] = del - ? (short) TR_UNMAPPED - : rlen - ? (short) TR_UNMAPPED - : (short) TR_R_EMPTY; - } + } - /* both non-utf8 and utf8 code paths end up here */ + Safefree(r_map); - warnins: - if(del && rlen == tlen) { + if(del && rlen != 0 && r_count == t_count) { Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Useless use of /d modifier in transliteration operator"); - } else if(rlen > tlen && !complement) { + } else if(r_count > t_count) { Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Replacement list is longer than search list"); } - if (grows) - o->op_private |= OPpTRANS_GROWS; op_free(expr); op_free(repl); @@ -1110,7 +1110,7 @@ C<sib> is non-null. For a higher-level interface, see C<L</op_sibling_splice>>. "Use of strings with code points over 0xFF as arguments to " \ "%s operator is not allowed" #endif -#if defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) +#if defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_PERL_C) # define TR_UNMAPPED (UV)-1 # define TR_DELETE (UV)-2 # define TR_R_EMPTY (UV)-3 /* rhs (replacement) is empty */ @@ -4859,31 +4859,26 @@ STATIC Size_t S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const #define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX \ assert(sv); assert(tbl) -STATIC Size_t S_do_trans_complex_utf8(pTHX_ SV * const sv) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8 \ - assert(sv) - STATIC Size_t S_do_trans_count(pTHX_ SV * const sv, const OPtrans_map * const tbl) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_DO_TRANS_COUNT \ assert(sv); assert(tbl) -STATIC Size_t S_do_trans_count_utf8(pTHX_ SV * const sv) +STATIC Size_t S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const map) __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8 \ - assert(sv) +#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP \ + assert(sv); assert(map) + +STATIC Size_t S_do_trans_invmap(pTHX_ SV * const sv, AV * const map) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_DO_TRANS_INVMAP \ + assert(sv); assert(map) STATIC Size_t S_do_trans_simple(pTHX_ SV * const sv, const OPtrans_map * const tbl) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE \ assert(sv); assert(tbl) -STATIC Size_t S_do_trans_simple_utf8(pTHX_ SV * const sv) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8 \ - assert(sv) - #endif #if defined(PERL_IN_DUMP_C) STATIC CV* S_deb_curcv(pTHX_ I32 ix); @@ -5540,12 +5535,6 @@ STATIC SV * S_space_join_names_mortal(pTHX_ char *const *array); STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist); #define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS \ assert(pRExC_state); assert(invlist) -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST -#endif - STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_ADD_DATA \ @@ -5582,13 +5571,6 @@ STATIC SV * S_get_ANYOFM_contents(pTHX_ const regnode * n) STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node); #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ assert(pRExC_state); assert(node) -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \ - assert(invlist) -#endif - STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth); #define PERL_ARGS_ASSERT_GROK_BSLASH_N \ assert(pRExC_state); assert(flagp) @@ -5614,46 +5596,12 @@ PERL_STATIC_INLINE SV* S_invlist_contents(pTHX_ SV* const invlist, const bool tr #endif #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len); -#define PERL_ARGS_ASSERT_INVLIST_EXTEND \ - assert(invlist) -#endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \ - assert(invlist) -#endif - -#ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE bool S_invlist_is_iterating(SV* const invlist) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_INVLIST_IS_ITERATING \ assert(invlist) #endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist); -#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \ - assert(invlist) -#endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist); -#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \ - assert(invlist) -#endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \ - assert(invlist); assert(start); assert(end) -#endif - -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset); -#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \ - assert(invlist) -#endif STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc); #define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT \ assert(pRExC_state); assert(ssc) @@ -5811,6 +5759,55 @@ PERL_CALLCONV void Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char* #define PERL_ARGS_ASSERT__INVLIST_DUMP \ assert(file); assert(indent); assert(invlist) #endif +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \ + assert(invlist) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len); +#define PERL_ARGS_ASSERT_INVLIST_EXTEND \ + assert(invlist) +#endif +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \ + assert(invlist) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist); +#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \ + assert(invlist) +#endif +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist); +#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \ + assert(invlist) +#endif +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \ + assert(invlist); assert(start); assert(end) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset); +#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \ + assert(invlist) +#endif +#endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) PERL_CALLCONV bool Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b); #define PERL_ARGS_ASSERT__INVLISTEQ \ @@ -5832,7 +5829,7 @@ PERL_CALLCONV void Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode* #define PERL_ARGS_ASSERT_REGPROP \ assert(sv); assert(o) #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE bool S__invlist_contains_cp(SV* const invlist, const UV cp) __attribute__warn_unused_result__; @@ -2905,12 +2905,12 @@ S_scan_const(pTHX_ char *start) bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ bool in_charclass = FALSE; /* within /[...]/ */ - bool d_is_utf8 = FALSE; /* Output constant is UTF8 */ bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be UTF8? But, this can show as true when the source isn't utf8, as for example when it is entirely composed of hex constants */ + bool d_is_utf8 = FALSE; /* Output constant is UTF8 */ STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the number of characters found so far that will expand (into 2 bytes) @@ -2951,11 +2951,6 @@ S_scan_const(pTHX_ char *start) PERL_ARGS_ASSERT_SCAN_CONST; assert(PL_lex_inwhat != OP_TRANSR); - if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) { - /* If we are doing a trans and we know we want UTF8, set expectation */ - d_is_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF); - s_is_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - } /* Protect sv from errors and fatal warnings. */ ENTER_with_name("scan_const"); @@ -3646,13 +3641,6 @@ S_scan_const(pTHX_ char *start) } d = (char*)uvchr_to_utf8((U8*)d, uv); - if (PL_lex_inwhat == OP_TRANS - && PL_parser->lex_sub_op) - { - PL_parser->lex_sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF - : OPpTRANS_TO_UTF); - } } } #ifdef EBCDIC @@ -4133,10 +4121,6 @@ S_scan_const(pTHX_ char *start) SvPOK_on(sv); if (d_is_utf8) { SvUTF8_on(sv); - if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) { - PL_parser->lex_sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - } } /* shrink the sv if we allocated more than we used */ @@ -10297,9 +10281,7 @@ S_scan_trans(pTHX_ char *start) o = newPVOP(nondestruct ? OP_TRANSR : OP_TRANS, 0, (char*)NULL); o->op_private &= ~OPpTRANS_ALL; - o->op_private |= del|squash|complement| - (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)| - (DO_UTF8(PL_parser->lex_sub_repl) ? OPpTRANS_TO_UTF : 0); + o->op_private |= del|squash|complement; PL_lex_op = o; pl_yylval.ival = nondestruct ? OP_TRANSR : OP_TRANS; |