summaryrefslogtreecommitdiff
path: root/proto.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2019-11-04 21:30:48 -0700
committerKarl Williamson <khw@cpan.org>2019-11-06 21:22:24 -0700
commitf34acfecc286f2eff2450db713da005d888a7317 (patch)
tree48c7a2dcb411f0ed37216761f759365c82c6f243 /proto.h
parent8c90d3a9c79a9471ef12dde584263fc38571cf46 (diff)
downloadperl-f34acfecc286f2eff2450db713da005d888a7317.tar.gz
Reimplement tr/// without swashes
This large commit removes the last use of swashes from core. It replaces swashes by inversion maps. This data structure is already in use for some Unicode properties, such as case changing. The inversion map data structure leads to straight forward implementation code, so I collapsed the two doop.c routines do_trans_complex_utf8() and do_trans_simple_utf8() into one. A few conditionals could be avoided in the loop if this function were split so that one version didn't have to test for, e.g., squashing, but I suspect these are in the noise in the loop, which has to deal with UTF-8 conversions. This should be faster than the previous implementation anyway. I measured the differences some releases back, and inversion maps were faster than the equivalent swash for up to 512 or 1024 different ranges. These numbers are unlikely to be exceeded in tr/// except possibly in machine-generated ones. Inversion maps are capable of handling both UTF-8 and non-UTF-8 cases, but I left in the existing non-UTF-8 implementation, which uses tables, because I suspect it is faster. This means that there is extra code, purely for runtime performance. An inversion map is always created from the input, and then if the table implementation is to be used, the table is easily derived from the map. Prior to this commit, the table implementation was used in certain edge cases involving code points above 255. Those cases are now handled by the inversion map implementation, because it would have taken extra code to detect them, and I didn't think it was worth it. That could be changed if I am wrong. Creating an inversion map for all inputs essentially normalizes them, and then the same logic is usable for all. This fixes some false negatives in the previous implementation. It also allows for detecting if the actual transliteration can be done in place. Previously, the code mostly punted on that detection for the UTF-8 case. This also allows for accurate counting of the lengths of the two sides, fixing some longstanding TODO warning tests. A new flag is created, OPpTRANS_CAN_FORCE_UTF8, when the tr/// has a below 256 character resolving to one that requires UTF-8. If this isn't set, the code knows that a non-UTF-8 input won't become UTF-8 in the process, and so can take short cuts. The bit representing this flag is the same as OPpTRANS_FROM_UTF, which is no longer used. That name is left in so that the dozen-ish modules in cpan that refer to it can still compile. AFAICT none of them actually use the flag, as well they shouldn't since it is private to the core. Inversion maps are ideally suited for tr/// implementations. An issue with them in general is that for some pathological data, they can become fragmented requiring more space than you would expect, to represent the underlying data. However, the typical tr/// would not have this issue, requiring only very short inversion maps to represent; in some cases shorter than the table implementation. Inversion maps are also easier to deparse than swashes. A deparse TODO was also fixed by this commit, and the code to deparse UTF-8 inputs is simplified. One could implement specialized data structures for specific types of inputs. For example, a common tr/// form is a single range, like tr/A-Z/a-z/. That could be implemented without a table and be quite fast. An intermediate step would be to use the inversion map implementation always when the transliteration is a single range, and then special case length=1 maps at execution time. Thanks to Nicholas Rochemagne for his help on B
Diffstat (limited to 'proto.h')
-rw-r--r--proto.h119
1 files changed, 58 insertions, 61 deletions
diff --git a/proto.h b/proto.h
index 20e4b0e511..51c316e070 100644
--- a/proto.h
+++ b/proto.h
@@ -4859,31 +4859,26 @@ STATIC Size_t S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const
#define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX \
assert(sv); assert(tbl)
-STATIC Size_t S_do_trans_complex_utf8(pTHX_ SV * const sv)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8 \
- assert(sv)
-
STATIC Size_t S_do_trans_count(pTHX_ SV * const sv, const OPtrans_map * const tbl)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_DO_TRANS_COUNT \
assert(sv); assert(tbl)
-STATIC Size_t S_do_trans_count_utf8(pTHX_ SV * const sv)
+STATIC Size_t S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const map)
__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8 \
- assert(sv)
+#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP \
+ assert(sv); assert(map)
+
+STATIC Size_t S_do_trans_invmap(pTHX_ SV * const sv, AV * const map)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_DO_TRANS_INVMAP \
+ assert(sv); assert(map)
STATIC Size_t S_do_trans_simple(pTHX_ SV * const sv, const OPtrans_map * const tbl)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE \
assert(sv); assert(tbl)
-STATIC Size_t S_do_trans_simple_utf8(pTHX_ SV * const sv)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8 \
- assert(sv)
-
#endif
#if defined(PERL_IN_DUMP_C)
STATIC CV* S_deb_curcv(pTHX_ I32 ix);
@@ -5540,12 +5535,6 @@ STATIC SV * S_space_join_names_mortal(pTHX_ char *const *array);
STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist);
#define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS \
assert(pRExC_state); assert(invlist)
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST
-#endif
-
STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_ADD_DATA \
@@ -5582,13 +5571,6 @@ STATIC SV * S_get_ANYOFM_contents(pTHX_ const regnode * n)
STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node);
#define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \
assert(pRExC_state); assert(node)
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \
- assert(invlist)
-#endif
-
STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth);
#define PERL_ARGS_ASSERT_GROK_BSLASH_N \
assert(pRExC_state); assert(flagp)
@@ -5614,46 +5596,12 @@ PERL_STATIC_INLINE SV* S_invlist_contents(pTHX_ SV* const invlist, const bool tr
#endif
#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len);
-#define PERL_ARGS_ASSERT_INVLIST_EXTEND \
- assert(invlist)
-#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \
- assert(invlist)
-#endif
-
-#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S_invlist_is_iterating(SV* const invlist)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_INVLIST_IS_ITERATING \
assert(invlist)
#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist);
-#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \
- assert(invlist)
-#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist);
-#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \
- assert(invlist)
-#endif
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \
- assert(invlist); assert(start); assert(end)
-#endif
-
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset);
-#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \
- assert(invlist)
-#endif
STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc);
#define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT \
assert(pRExC_state); assert(ssc)
@@ -5811,6 +5759,55 @@ PERL_CALLCONV void Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char*
#define PERL_ARGS_ASSERT__INVLIST_DUMP \
assert(file); assert(indent); assert(invlist)
#endif
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \
+ assert(invlist)
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len);
+#define PERL_ARGS_ASSERT_INVLIST_EXTEND \
+ assert(invlist)
+#endif
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \
+ assert(invlist)
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist);
+#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \
+ assert(invlist)
+#endif
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist);
+#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \
+ assert(invlist)
+#endif
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \
+ assert(invlist); assert(start); assert(end)
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset);
+#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \
+ assert(invlist)
+#endif
+#endif
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C)
PERL_CALLCONV bool Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b);
#define PERL_ARGS_ASSERT__INVLISTEQ \
@@ -5832,7 +5829,7 @@ PERL_CALLCONV void Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode*
#define PERL_ARGS_ASSERT_REGPROP \
assert(sv); assert(o)
#endif
-#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C)
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S__invlist_contains_cp(SV* const invlist, const UV cp)
__attribute__warn_unused_result__;