summary | refs | log | tree | commit | diff
path: root/inline.h
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2021-05-13 08:50:23 -0600
committer: Karl Williamson <khw@cpan.org> 2022-02-01 20:33:48 -0700
commit: a460925186154b270b7a647a4f30b2f01fd97c4b (patch)
tree: 365c283a6141fb6a8b27db035ba6473b36447895 /inline.h
parent: c0e63b1341c14ebd84a23fdbba9759d3fd686498 (diff)
download: perl-a460925186154b270b7a647a4f30b2f01fd97c4b.tar.gz
Refactor utf8 to code point conversion
Most such conversions occur in the inlined function
Perl_utf8n_to_uvchr_msgs(), which several macros like utf8n_to_uvchr()
expand to.

This commit effectively removes a conditional from inside the loop, and
avoids some conditionals when converting the common case of the input
being UTF-8 invariant (ASCII on ASCII platforms).

Prior to this commit, the code did something different the first time
through the loop than the other times. By hoisting that to pre-loop
initialization, that conditional is removed from each iteration. That
meant rearranging the loop to be a while(1), and have its exit
conditions in the middle.

All calls to this function from the Perl core pass in a non-empty
string. But outside calls could conceivably pass an empty one which
could lead to reading outside the buffer. An extra check is added to
non-core calls, as is already done elsewhere.

This change means that calls from core execute no more conditionals than
the typical:

    if (UTF8_IS_INVARIANT(*s)) {
        code_point = *s;
    }
    else {
        code_point = utf8n_to_uvchr(s, ...)
    }

I'm therefore thinking these can now just be replaced by the simpler

    code_point = utf8n_to_uvchr(s, ...)

without a noticeable hit in performance. The essential difference is
that the former gets its code point from the string already being
examined, and the latter looks up data in a 450 byte static array that
is referred to constantly, so is likely to be cached.
Diffstat (limited to 'inline.h')
-rw-r--r-- inline.h 64
1 files changed, 41 insertions, 23 deletions
diff --git a/inline.h b/inline.h
index b920cdf0f8..7ff7d799a2 100644
--- a/inline.h
+++ b/inline.h
@@ -2457,8 +2457,8 @@ Perl_utf8n_to_uvchr_msgs(const U8 *s,
const U8 * const s0 = s;
const U8 * send = s0 + curlen;
- UV uv = 0; /* The 0 silences some stupid compilers */
- UV state = 0;
+ UV type;
+ UV uv;
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
@@ -2467,34 +2467,52 @@ Perl_utf8n_to_uvchr_msgs(const U8 *s,
* Otherwise we call a helper function to figure out the more complicated
* cases. */
- while (s < send && LIKELY(state != 1)) {
- UV type = PL_strict_utf8_dfa_tab[*s];
+ /* No calls from core pass in an empty string; non-core need a check */
+ PERL_NON_CORE_CHECK_EMPTY(s, send);
- uv = (state == 0)
- ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
- : UTF8_ACCUMULATE(uv, *s);
- state = PL_strict_utf8_dfa_tab[256 + state + type];
+ type = PL_strict_utf8_dfa_tab[*s];
- if (state != 0) {
- s++;
- continue;
- }
+ /* The table is structured so that 'type' is 0 iff the input byte is
+ * represented identically regardless of the UTF-8ness of the string */
+ if (type == 0) { /* UTF-8 invariants are returned unchanged */
+ uv = *s;
+ }
+ else {
+ UV state = PL_strict_utf8_dfa_tab[256 + type];
+ uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
- if (retlen) {
- *retlen = s - s0 + 1;
- }
- if (errors) {
- *errors = 0;
- }
- if (msgs) {
- *msgs = NULL;
+ while (++s < send) {
+ type = PL_strict_utf8_dfa_tab[*s];
+ state = PL_strict_utf8_dfa_tab[256 + state + type];
+
+ uv = UTF8_ACCUMULATE(uv, *s);
+
+ if (state == 0) {
+ goto success;
+ }
+
+ if (UNLIKELY(state == 1)) {
+ break;
+ }
}
- return UNI_TO_NATIVE(uv);
+ /* Here is potentially problematic. Use the full mechanism */
+ return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
+ errors, msgs);
+ }
+
+ success:
+ if (retlen) {
+ *retlen = s - s0 + 1;
+ }
+ if (errors) {
+ *errors = 0;
+ }
+ if (msgs) {
+ *msgs = NULL;
}
- /* Here is potentially problematic. Use the full mechanism */
- return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
+ return UNI_TO_NATIVE(uv);
}
PERL_STATIC_INLINE UV