summaryrefslogtreecommitdiff
path: root/doop.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2019-11-04 21:30:48 -0700
committerKarl Williamson <khw@cpan.org>2019-11-06 21:22:24 -0700
commitf34acfecc286f2eff2450db713da005d888a7317 (patch)
tree48c7a2dcb411f0ed37216761f759365c82c6f243 /doop.c
parent8c90d3a9c79a9471ef12dde584263fc38571cf46 (diff)
downloadperl-f34acfecc286f2eff2450db713da005d888a7317.tar.gz
Reimplement tr/// without swashes
This large commit removes the last use of swashes from core. It replaces swashes by inversion maps. This data structure is already in use for some Unicode properties, such as case changing. The inversion map data structure leads to straight forward implementation code, so I collapsed the two doop.c routines do_trans_complex_utf8() and do_trans_simple_utf8() into one. A few conditionals could be avoided in the loop if this function were split so that one version didn't have to test for, e.g., squashing, but I suspect these are in the noise in the loop, which has to deal with UTF-8 conversions. This should be faster than the previous implementation anyway. I measured the differences some releases back, and inversion maps were faster than the equivalent swash for up to 512 or 1024 different ranges. These numbers are unlikely to be exceeded in tr/// except possibly in machine-generated ones. Inversion maps are capable of handling both UTF-8 and non-UTF-8 cases, but I left in the existing non-UTF-8 implementation, which uses tables, because I suspect it is faster. This means that there is extra code, purely for runtime performance. An inversion map is always created from the input, and then if the table implementation is to be used, the table is easily derived from the map. Prior to this commit, the table implementation was used in certain edge cases involving code points above 255. Those cases are now handled by the inversion map implementation, because it would have taken extra code to detect them, and I didn't think it was worth it. That could be changed if I am wrong. Creating an inversion map for all inputs essentially normalizes them, and then the same logic is usable for all. This fixes some false negatives in the previous implementation. It also allows for detecting if the actual transliteration can be done in place. Previously, the code mostly punted on that detection for the UTF-8 case. This also allows for accurate counting of the lengths of the two sides, fixing some longstanding TODO warning tests. A new flag is created, OPpTRANS_CAN_FORCE_UTF8, when the tr/// has a below 256 character resolving to one that requires UTF-8. If this isn't set, the code knows that a non-UTF-8 input won't become UTF-8 in the process, and so can take short cuts. The bit representing this flag is the same as OPpTRANS_FROM_UTF, which is no longer used. That name is left in so that the dozen-ish modules in cpan that refer to it can still compile. AFAICT none of them actually use the flag, as well they shouldn't since it is private to the core. Inversion maps are ideally suited for tr/// implementations. An issue with them in general is that for some pathological data, they can become fragmented requiring more space than you would expect, to represent the underlying data. However, the typical tr/// would not have this issue, requiring only very short inversion maps to represent; in some cases shorter than the table implementation. Inversion maps are also easier to deparse than swashes. A deparse TODO was also fixed by this commit, and the code to deparse UTF-8 inputs is simplified. One could implement specialized data structures for specific types of inputs. For example, a common tr/// form is a single range, like tr/A-Z/a-z/. That could be implemented without a table and be quite fast. An intermediate step would be to use the inversion map implementation always when the transliteration is a single range, and then special case length=1 maps at execution time. Thanks to Nicholas Rochemagne for his help on B
Diffstat (limited to 'doop.c')
-rw-r--r--doop.c506
1 files changed, 219 insertions, 287 deletions
diff --git a/doop.c b/doop.c
index e0d63f13d1..3cb1354540 100644
--- a/doop.c
+++ b/doop.c
@@ -22,6 +22,7 @@
#include "EXTERN.h"
#define PERL_IN_DOOP_C
#include "perl.h"
+#include "invlist_inline.h"
#ifndef PERL_MICRO
#include <signal.h>
@@ -297,328 +298,240 @@ S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const tbl)
/* Helper function for do_trans().
- * Handles utf8 cases(*) not involving the /c, /d, /s flags,
- * and where search and replacement charlists aren't identical.
- * (*) i.e. where the search or replacement charlists are utf8. sv may
- * or may not be utf8.
+ * Handles cases where an inversion map implementation is to be used and the
+ * search and replacement charlists are identical: so the string isn't
+ * modified, and only a count of modifiable chars is needed.
+ *
+ * Note that it doesn't handle /d nor /s, since these modify the string
+ * even if the replacement charlist is empty.
+ *
+ * sv may or may not be utf8.
*/
STATIC Size_t
-S_do_trans_simple_utf8(pTHX_ SV * const sv)
+S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const invmap)
{
U8 *s;
U8 *send;
- U8 *d;
- U8 *start;
- U8 *dstart, *dend;
Size_t matches = 0;
- const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS);
STRLEN len;
- SV* const rv =
-#ifdef USE_ITHREADS
- PAD_SVl(cPADOP->op_padix);
-#else
- MUTABLE_SV(cSVOP->op_sv);
-#endif
- HV* const hv = MUTABLE_HV(SvRV(rv));
- SV* const * svp = hv_fetchs(hv, "NONE", FALSE);
- const UV none = svp ? SvUV(*svp) : 0x7fffffff;
- const UV extra = none + 1;
- UV final = 0;
- U8 hibit = 0;
+ SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE);
+ SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE);
+ SV* from_invlist = *from_invlist_ptr;
+ SV* to_invmap_sv = *to_invmap_ptr;
+ UV* map = (UV *) SvPVX(to_invmap_sv);
- PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8;
+ PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP;
s = (U8*)SvPV_nomg(sv, len);
- if (!SvUTF8(sv)) {
- hibit = ! is_utf8_invariant_string(s, len);
- if (hibit) {
- s = bytes_to_utf8(s, &len);
- }
- }
+
send = s + len;
- start = s;
- svp = hv_fetchs(hv, "FINAL", FALSE);
- if (svp)
- final = SvUV(*svp);
+ while (s < send) {
+ UV from;
+ SSize_t i;
+ STRLEN s_len;
+
+ /* Get the code point of the next character in the string */
+ if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) {
+ from = *s;
+ s_len = 1;
+ }
+ else {
+ from = utf8_to_uvchr_buf(s, send, &s_len);
+ if (from == 0 && *s != '\0') {
+ _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE);
+ }
+ }
- if (grows) {
- /* d needs to be bigger than s, in case e.g. upgrading is required */
- Newx(d, len * 3 + UTF8_MAXBYTES, U8);
- dend = d + len * 3;
- dstart = d;
- }
- else {
- dstart = d = s;
- dend = d + len;
- }
+ /* Look the code point up in the data structure for this tr/// to get
+ * what it maps to */
+ i = _invlist_search(from_invlist, from);
+ assert(i >= 0);
- while (s < send) {
- const UV uv = swash_fetch(rv, s, TRUE);
- if (uv < none) {
- s += UTF8SKIP(s);
- matches++;
- d = uvchr_to_utf8(d, uv);
- }
- else if (uv == none) {
- const int i = UTF8SKIP(s);
- Move(s, d, i, U8);
- d += i;
- s += i;
- }
- else if (uv == extra) {
- s += UTF8SKIP(s);
- matches++;
- d = uvchr_to_utf8(d, final);
- }
- else
- s += UTF8SKIP(s);
-
- if (d > dend) {
- const STRLEN clen = d - dstart;
- const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
- if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__);
- Renew(dstart, nlen + UTF8_MAXBYTES, U8);
- d = dstart + clen;
- dend = dstart + nlen;
- }
- }
- if (grows || hibit) {
- sv_setpvn(sv, (char*)dstart, d - dstart);
- Safefree(dstart);
- if (grows && hibit)
- Safefree(start);
- }
- else {
- *d = '\0';
- SvCUR_set(sv, d - dstart);
+ if (map[i] != (UV) TR_UNLISTED) {
+ matches++;
+ }
+
+ s += s_len;
}
- SvSETMAGIC(sv);
- SvUTF8_on(sv);
return matches;
}
/* Helper function for do_trans().
- * Handles utf8 cases(*) where search and replacement charlists are
- * identical: so the string isn't modified, and only a count of modifiable
- * chars is needed.
- * Note that it doesn't handle /d or /s, since these modify the string
- * even if the replacement charlist is empty.
- * (*) i.e. where the search or replacement charlists are utf8. sv may
- * or may not be utf8.
+ * Handles cases where an inversion map implementation is to be used and the
+ * search and replacement charlists are either not identical or flags are
+ * present.
+ *
+ * sv may or may not be utf8.
*/
STATIC Size_t
-S_do_trans_count_utf8(pTHX_ SV * const sv)
+S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap)
{
- const U8 *s;
- const U8 *start = NULL;
- const U8 *send;
+ U8 *s;
+ U8 *send;
+ U8 *d;
+ U8 *s0;
+ U8 *d0;
Size_t matches = 0;
STRLEN len;
- SV* const rv =
-#ifdef USE_ITHREADS
- PAD_SVl(cPADOP->op_padix);
-#else
- MUTABLE_SV(cSVOP->op_sv);
-#endif
- HV* const hv = MUTABLE_HV(SvRV(rv));
- SV* const * const svp = hv_fetchs(hv, "NONE", FALSE);
- const UV none = svp ? SvUV(*svp) : 0x7fffffff;
- const UV extra = none + 1;
- U8 hibit = 0;
-
- PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8;
+ SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE);
+ SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE);
+ SV** const to_expansion_ptr = av_fetch(invmap, 2, TRUE);
+ NV max_expansion = SvNV(*to_expansion_ptr);
+ SV* from_invlist = *from_invlist_ptr;
+ SV* to_invmap_sv = *to_invmap_ptr;
+ UV* map = (UV *) SvPVX(to_invmap_sv);
+ UV previous_map = TR_OOB;
+ const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH);
+ const bool delete_unfound = cBOOL(PL_op->op_private & OPpTRANS_DELETE);
+ bool inplace = ! cBOOL(PL_op->op_private & OPpTRANS_GROWS);
+ const UV* from_array = invlist_array(from_invlist);
+ UV final_map;
+ bool out_is_utf8 = SvUTF8(sv);
+ STRLEN s_len;
+
+ PERL_ARGS_ASSERT_DO_TRANS_INVMAP;
+
+ /* A third element in the array indicates that the replacement list was
+ * shorter than the search list, and this element contains the value to use
+ * for the items that don't correspond */
+ if (av_top_index(invmap) >= 3) {
+ SV** const final_map_ptr = av_fetch(invmap, 3, TRUE);
+ SV* const final_map_sv = *final_map_ptr;
+ final_map = SvUV(final_map_sv);
+ }
- s = (const U8*)SvPV_nomg_const(sv, len);
- if (!SvUTF8(sv)) {
- hibit = ! is_utf8_invariant_string(s, len);
- if (hibit) {
- start = s = bytes_to_utf8(s, &len);
- }
+ /* If there is something in the transliteration that could force the input
+ * to be changed to UTF-8, we don't know if we can do it in place, so
+ * assume cannot */
+ if (! out_is_utf8 && (PL_op->op_private & OPpTRANS_CAN_FORCE_UTF8)) {
+ inplace = FALSE;
+ if (max_expansion < 2) {
+ max_expansion = 2;
+ }
}
+
+ s = (U8*)SvPV_nomg(sv, len);
send = s + len;
+ s0 = s;
- while (s < send) {
- const UV uv = swash_fetch(rv, s, TRUE);
- if (uv < none || uv == extra)
- matches++;
- s += UTF8SKIP(s);
+ /* We know by now if there are some possible input strings whose
+ * transliterations are longer than the input. If none can, we just edit
+ * in place. */
+ if (inplace) {
+ d0 = d = s;
+ }
+ else {
+ /* Here, we can't edit in place. We have no idea how much, if any,
+ * this particular input string will grow. However, the compilation
+ * calculated the maximum expansion possible. Use that to allocale
+ * based on the worst case scenario. */
+ Newx(d, len * max_expansion + 1, U8);
+ d0 = d;
}
- if (hibit)
- Safefree(start);
- return matches;
-}
+ restart:
+ /* Do the actual transliteration */
+ while (s < send) {
+ UV from;
+ UV to;
+ SSize_t i;
+ STRLEN s_len;
+
+ /* Get the code point of the next character in the string */
+ if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) {
+ from = *s;
+ s_len = 1;
+ }
+ else {
+ from = utf8_to_uvchr_buf(s, send, &s_len);
+ if (from == 0 && *s != '\0') {
+ _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE);
+ }
+ }
-/* Helper function for do_trans().
- * Handles utf8 cases(*) involving the /c, /d, /s flags,
- * and where search and replacement charlists aren't identical.
- * (*) i.e. where the search or replacement charlists are utf8. sv may
- * or may not be utf8.
- */
+ /* Look the code point up in the data structure for this tr/// to get
+ * what it maps to */
+ i = _invlist_search(from_invlist, from);
+ assert(i >= 0);
-STATIC Size_t
-S_do_trans_complex_utf8(pTHX_ SV * const sv)
-{
- U8 *start, *send;
- U8 *d;
- Size_t matches = 0;
- const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH);
- const bool del = cBOOL(PL_op->op_private & OPpTRANS_DELETE);
- const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS);
- SV* const rv =
-#ifdef USE_ITHREADS
- PAD_SVl(cPADOP->op_padix);
-#else
- MUTABLE_SV(cSVOP->op_sv);
-#endif
- HV * const hv = MUTABLE_HV(SvRV(rv));
- SV * const *svp = hv_fetchs(hv, "NONE", FALSE);
- const UV none = svp ? SvUV(*svp) : 0x7fffffff;
- const UV extra = none + 1;
- UV final = 0;
- bool havefinal = FALSE;
- STRLEN len;
- U8 *dstart, *dend;
- U8 hibit = 0;
- U8 *s = (U8*)SvPV_nomg(sv, len);
+ to = map[i];
- PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8;
+ if (to == (UV) TR_UNLISTED) { /* Just copy the unreplaced character */
+ if (UVCHR_IS_INVARIANT(from) || ! out_is_utf8) {
+ *d++ = from;
+ }
+ else if (SvUTF8(sv)) {
+ Move(s, d, s_len, U8);
+ d += s_len;
+ }
+ else { /* Convert to UTF-8 */
+ append_utf8_from_native_byte(*s, &d);
+ }
- if (!SvUTF8(sv)) {
- hibit = ! is_utf8_invariant_string(s, len);
- if (hibit) {
- s = bytes_to_utf8(s, &len);
+ previous_map = to;
+ s += s_len;
+ continue;
}
- }
- send = s + len;
- start = s;
- svp = hv_fetchs(hv, "FINAL", FALSE);
- if (svp) {
- final = SvUV(*svp);
- havefinal = TRUE;
- }
+ /* Everything else is counted as a match */
+ matches++;
- if (grows) {
- /* d needs to be bigger than s, in case e.g. upgrading is required */
- Newx(d, len * 3 + UTF8_MAXBYTES, U8);
- dend = d + len * 3;
- dstart = d;
- }
- else {
- dstart = d = s;
- dend = d + len;
- }
+ if (to == (UV) TR_SPECIAL_HANDLING) {
+ if (delete_unfound) {
+ previous_map = to;
+ s += s_len;
+ continue;
+ }
- if (squash) {
- UV puv = 0xfeedface;
- while (s < send) {
- UV uv = swash_fetch(rv, s, TRUE);
-
- if (d > dend) {
- const STRLEN clen = d - dstart;
- const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
- if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
- Renew(dstart, nlen + UTF8_MAXBYTES, U8);
- d = dstart + clen;
- dend = dstart + nlen;
- }
- if (uv < none) {
- matches++;
- s += UTF8SKIP(s);
- if (uv != puv) {
- d = uvchr_to_utf8(d, uv);
- puv = uv;
- }
- continue;
- }
- else if (uv == none) { /* "none" is unmapped character */
- const int i = UTF8SKIP(s);
- Move(s, d, i, U8);
- d += i;
- s += i;
- puv = 0xfeedface;
- continue;
- }
- else if (uv == extra && !del) {
- matches++;
- if (havefinal) {
- s += UTF8SKIP(s);
- if (puv != final) {
- d = uvchr_to_utf8(d, final);
- puv = final;
- }
- }
- else {
- STRLEN len;
- uv = utf8n_to_uvchr(s, send - s, &len, UTF8_ALLOW_DEFAULT);
- if (uv != puv) {
- Move(s, d, len, U8);
- d += len;
- puv = uv;
- }
- s += len;
- }
- continue;
- }
- matches++; /* "none+1" is delete character */
- s += UTF8SKIP(s);
- }
- }
- else {
- while (s < send) {
- const UV uv = swash_fetch(rv, s, TRUE);
- if (d > dend) {
- const STRLEN clen = d - dstart;
- const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
- if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
- Renew(dstart, nlen + UTF8_MAXBYTES, U8);
- d = dstart + clen;
- dend = dstart + nlen;
- }
- if (uv < none) {
- matches++;
- s += UTF8SKIP(s);
- d = uvchr_to_utf8(d, uv);
- continue;
- }
- else if (uv == none) { /* "none" is unmapped character */
- const int i = UTF8SKIP(s);
- Move(s, d, i, U8);
- d += i;
- s += i;
- continue;
- }
- else if (uv == extra && !del) {
- matches++;
- s += UTF8SKIP(s);
- d = uvchr_to_utf8(d, final);
- continue;
- }
- matches++; /* "none+1" is delete character */
- s += UTF8SKIP(s);
- }
+ /* Use the final character in the replacement list */
+ to = final_map;
+ }
+ else { /* Here the input code point is to be remapped. The actual
+ value is offset from the base of this entry */
+ to += from - from_array[i];
+ }
+
+ /* If copying all occurrences, or this is the first occurrence, copy it
+ * to the output */
+ if (! squash || to != previous_map) {
+ if (out_is_utf8) {
+ d = uvchr_to_utf8(d, to);
+ }
+ else {
+ if (to >= 256) { /* If need to convert to UTF-8, restart */
+ out_is_utf8 = TRUE;
+ s = s0;
+ d = d0;
+ matches = 0;
+ goto restart;
+ }
+ *d++ = to;
+ }
+ }
+
+ previous_map = to;
+ s += s_len;
}
- if (grows || hibit) {
- sv_setpvn(sv, (char*)dstart, d - dstart);
- Safefree(dstart);
- if (grows && hibit)
- Safefree(start);
+
+ s_len = 0;
+ s += s_len;
+ if (! inplace) {
+ sv_setpvn(sv, (char*)d0, d - d0);
}
else {
*d = '\0';
- SvCUR_set(sv, d - dstart);
+ SvCUR_set(sv, d - d0);
+ }
+
+ if (! SvUTF8(sv) && out_is_utf8) {
+ SvUTF8_on(sv);
}
- SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
@@ -627,7 +540,8 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv)
/* Execute a tr//. sv is the value to be translated, while PL_op
* should be an OP_TRANS or OP_TRANSR op, whose op_pv field contains a
- * translation table or whose op_sv field contains a swash.
+ * translation table or whose op_sv field contains an inversion map.
+ *
* Returns a count of number of characters translated
*/
@@ -636,31 +550,49 @@ Perl_do_trans(pTHX_ SV *sv)
{
STRLEN len;
const U8 flags = PL_op->op_private;
- const U8 hasutf = flags & (OPpTRANS_FROM_UTF | OPpTRANS_TO_UTF);
+ bool use_utf8_fcns = cBOOL(flags & OPpTRANS_USE_SVOP);
+ bool identical = cBOOL(flags & OPpTRANS_IDENTICAL);
PERL_ARGS_ASSERT_DO_TRANS;
- if (SvREADONLY(sv) && !(flags & OPpTRANS_IDENTICAL)) {
+ if (SvREADONLY(sv) && ! identical) {
Perl_croak_no_modify();
}
(void)SvPV_const(sv, len);
if (!len)
return 0;
- if (!(flags & OPpTRANS_IDENTICAL)) {
+ if (! identical) {
if (!SvPOKp(sv) || SvTHINKFIRST(sv))
(void)SvPV_force_nomg(sv, len);
(void)SvPOK_only_UTF8(sv);
}
- /* If we use only OPpTRANS_IDENTICAL to bypass the READONLY check,
- * we must also rely on it to choose the readonly strategy.
- */
- if (flags & OPpTRANS_IDENTICAL) {
- return hasutf ? do_trans_count_utf8(sv) : do_trans_count(sv, (OPtrans_map*)cPVOP->op_pv);
- } else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) {
- return hasutf ? do_trans_complex_utf8(sv) : do_trans_complex(sv, (OPtrans_map*)cPVOP->op_pv);
- } else {
- return hasutf ? do_trans_simple_utf8(sv) : do_trans_simple(sv, (OPtrans_map*)cPVOP->op_pv);
+ if (use_utf8_fcns) {
+ SV* const map =
+#ifdef USE_ITHREADS
+ PAD_SVl(cPADOP->op_padix);
+#else
+ MUTABLE_SV(cSVOP->op_sv);
+#endif
+
+ if (identical) {
+ return do_trans_count_invmap(sv, (AV *) map);
+ }
+ else {
+ return do_trans_invmap(sv, (AV *) map);
+ }
+ }
+ else {
+ const OPtrans_map * const map = (OPtrans_map*)cPVOP->op_pv;
+
+ if (identical) {
+ return do_trans_count(sv, map);
+ }
+ else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) {
+ return do_trans_complex(sv, map);
+ }
+ else
+ return do_trans_simple(sv, map);
}
}