Diffstat (limited to 'utf8.c')
-rw-r--r--  utf8.c  150
1 file changed, 130 insertions, 20 deletions
diff --git a/utf8.c b/utf8.c
index 91e314261e..14d92d09df 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2161,44 +2161,154 @@ If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
and returns the number of valid characters.
=cut
+
+ For long strings we process the input word-at-a-time, and count
+ continuations, instead of counting characters one at a time and using
+ UTF8SKIP to find the start of the next one.  If our input were 13-byte
+ characters, the per-word method would be a loser, as we would be doing
+ things in 8-byte chunks (or 4-byte ones on a 32-bit platform).  But the
+ maximum legal Unicode code point occupies only 4 bytes in UTF-8, and most
+ text will have a significant number of 1- and 2-byte characters, so the
+ per-word method is generally a winner.
+
+ There are start-up and finish costs with the per-word method, so we use the
+ standard method unless the input has a relatively large length.
*/
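To make the contrast above concrete, here is a minimal standalone sketch of the two counting strategies (illustrative only, not part of the patch). The utf8_skip() helper is a simplified stand-in for UTF8SKIP, valid only for lead bytes of well-formed input; the second function counts characters as bytes minus continuation bytes (those of the form 0b10xxxxxx), which is exactly the quantity the word-at-a-time code below computes eight bytes at a time.

    #include <stddef.h>

    /* Simplified stand-in for UTF8SKIP: encoded length implied by a lead byte */
    static size_t utf8_skip(unsigned char b)
    {
        return (b < 0x80) ? 1 : (b < 0xE0) ? 2 : (b < 0xF0) ? 3 : 4;
    }

    /* Strategy 1: walk the string character-by-character */
    static size_t count_by_skipping(const unsigned char *s, const unsigned char *e)
    {
        size_t len = 0;
        while (s < e) {
            s += utf8_skip(*s);
            len++;
        }
        return len;
    }

    /* Strategy 2: characters = total bytes - continuation bytes */
    static size_t count_by_continuations(const unsigned char *s, const unsigned char *e)
    {
        size_t bytes = (size_t) (e - s);
        size_t continuations = 0;
        for (; s < e; s++)
            continuations += (*s & 0xC0) == 0x80;   /* is this byte 0b10xxxxxx? */
        return bytes - continuations;
    }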
STRLEN
-Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
+Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
{
+ STRLEN continuations = 0;
STRLEN len = 0;
+ const U8 * s = s0;
PERL_ARGS_ASSERT_UTF8_LENGTH;
- /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
- * the bitops (especially ~) can create illegal UTF-8.
- * In other words: in Perl UTF-8 is not just for Unicode. */
+ /* For EBCDIC and short strings, we count the characters.  The boundary
+ * was determined by eyeballing the output of Porting/bench.pl and
+ * choosing a number where the continuations method gave better results (on
+ * a 64 bit system, khw not having access to a 32 bit system with
+ * cachegrind). The number isn't critical, as at these sizes, the total
+ * time spent isn't large either way */
- while (s < e) {
- Ptrdiff_t expected_byte_count = UTF8SKIP(s);
+#ifndef EBCDIC
+
+ if (e - s0 < 96)
+
+#endif
+
+ {
+ while (s < e) { /* Count characters directly */
+
+ /* Take extra care to not exceed 'e' (which would be undefined
+ * behavior) should the input be malformed, with a partial
+ * character at the end */
+ Ptrdiff_t expected_byte_count = UTF8SKIP(s);
+ if (UNLIKELY(e - s < expected_byte_count)) {
+ goto warn_and_return;
+ }
+
+ len++;
+ s += expected_byte_count;
+ }
- if (UNLIKELY(e - s < expected_byte_count)) {
- goto warn_and_return;
+ if (LIKELY(e == s)) {
+ return len;
}
- len++;
- s += expected_byte_count;
+ warn_and_return:
+ if (ckWARN_d(WARN_UTF8)) {
+ if (PL_op)
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "%s in %s", unees, OP_DESC(PL_op));
+ else
+ Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", unees);
+ }
+
+ return s - s0;
}
- if (LIKELY(e == s)) {
- return len;
+#ifndef EBCDIC
+
+ /* Count continuations, word-at-a-time.
+ *
+ * We need to stop the per-word scan before the final character's start byte
+ * in order to preserve the limited error checking that's always been done */
+ const U8 * e_limit = e - UTF8_MAXBYTES;
+
+ /* Points to the first byte >= s that is positioned at a word boundary.  If
+ * s is on a word boundary, this is s itself; otherwise it points to the
+ * start of the next word. */
+ const U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+ - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
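To put numbers on the alignment arithmetic (illustrative values, not from the patch): on a typical 64-bit build PERL_WORDSIZE is 8 and PERL_WORD_BOUNDARY_MASK is 7, so if the address in s ends in ...03 the masked low bits are 3 and partial_word_end becomes s + 8 - 3, the next 8-byte boundary at ...08; if s is already aligned, PERL_IS_SUBWORD_ADDR(s) is 0, the adjustment vanishes, and partial_word_end is s itself.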
+
+ /* Process up to a full word boundary. */
+ while (s < partial_word_end) {
+ const Size_t skip = UTF8SKIP(s);
+
+ continuations += skip - 1;
+ s += skip;
}
- /* Here, s > e on entry */
+ /* Adjust back down any overshoot */
+ continuations -= s - partial_word_end;
+ s = partial_word_end;
+
+ do { /* Process per-word */
+
+ /* The idea for counting continuation bytes came from
+ * http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
+ * One thing it does that this doesn't is to prefetch the buffer
+ * __builtin_prefetch(&s[256], 0, 0);
+ *
+ * A continuation byte has '10' as its upper 2 bits; the rest are
+ * don't-cares.  The VARIANTS mask zeroes out all but the upper bit of
+ * each byte in the word. That gets shifted to the byte's lowest bit,
+ * and 'anded' with the complement of the 2nd highest bit of the byte,
+ * which has also been shifted to that position. Hence the bit in that
+ * position will be 1 iff the upper bit is 1 and the next one is 0. We
+ * then use the same integer multiplication and shifting that are used
+ * in variant_under_utf8_count() to count how many of those are set in
+ * the word. */
+
+ continuations += (((((* (const PERL_UINTMAX_T *) s)
+ & PERL_VARIANTS_WORD_MASK) >> 7)
+ & (((~ (* (const PERL_UINTMAX_T *) s))) >> 6))
+ * PERL_COUNT_MULTIPLIER)
+ >> ((PERL_WORDSIZE - 1) * CHARBITS);
+ s += PERL_WORDSIZE;
+ } while (s + PERL_WORDSIZE <= e_limit);
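As a self-contained illustration of the bit trick described in the comment above (a sketch, not part of the patch, assuming a 64-bit word and the conventional values of the masks: the high bit of every byte for PERL_VARIANTS_WORD_MASK and 0x0101010101010101 for PERL_COUNT_MULTIPLIER):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Assumed stand-ins for the Perl masks, for a 64-bit word */
    #define VARIANTS_WORD_MASK UINT64_C(0x8080808080808080)
    #define COUNT_MULTIPLIER   UINT64_C(0x0101010101010101)

    /* Count UTF-8 continuation bytes (0b10xxxxxx) in one 8-byte word */
    static unsigned count_continuations_in_word(uint64_t w)
    {
        /* bit 0 of each byte <- that byte's bit 7 */
        uint64_t top_set    = (w & VARIANTS_WORD_MASK) >> 7;
        /* bit 0 of each byte <- complement of that byte's bit 6
         * (other bit positions are don't-cares; the AND below discards them) */
        uint64_t next_clear = (~w) >> 6;
        uint64_t is_cont    = top_set & next_clear;   /* one 0/1 flag per byte */

        /* Multiplying by 0x0101...01 sums the eight 0/1 bytes into the top
         * byte of the product; shifting by 56 extracts that sum */
        return (unsigned) ((is_cont * COUNT_MULTIPLIER) >> 56);
    }

    int main(void)
    {
        /* "naïve!!": the 'ï' is 0xC3 0xAF in UTF-8, so exactly one byte in
         * this word (0xAF) is a continuation byte */
        const unsigned char bytes[8] = { 'n', 'a', 0xC3, 0xAF, 'v', 'e', '!', '!' };
        uint64_t w;
        memcpy(&w, bytes, sizeof w);              /* byte order is irrelevant here */
        printf("%u\n", count_continuations_in_word(w));   /* prints 1 */
        return 0;
    }

The multiply-and-shift at the end is the same byte-summing step the comment credits to variant_under_utf8_count(); only the per-byte flag being summed differs.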
+
+ /* Process remainder per-byte */
+ while (s < e) {
+ if (UTF8_IS_CONTINUATION(*s)) {
+ continuations++;
+ s++;
+ continue;
+ }
+
+ /* Here, *s is a start byte.  Use UTF8SKIP from now on */
+ do {
+ Ptrdiff_t expected_byte_count = UTF8SKIP(s);
+ if (UNLIKELY(e - s < expected_byte_count)) {
+ break;
+ }
+
+ continuations += expected_byte_count - 1;
+ s += expected_byte_count;
+ } while (s < e);
+
+ break;
+ }
+
+#endif
+
+ if (LIKELY(e == s)) {
+ return s - s0 - continuations;
+ }
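As a worked example of the byte-to-character conversion (not from the patch): "naïve" occupies 6 bytes in UTF-8, one of which (the trailing byte of the two-byte 'ï') is a continuation byte, so s - s0 - continuations yields 6 - 1 = 5 characters.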
- warn_and_return:
- if (PL_op)
- Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
- "%s in %s", unees, OP_DESC(PL_op));
- else
- Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
+ /* Convert to characters */
+ s -= continuations;
- return len;
+ goto warn_and_return;
}
/*