summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2021-06-05 16:31:11 -0600
committerKarl Williamson <khw@cpan.org>2021-08-07 05:14:44 -0600
commite932a5edb50acc7eadabb8b398bd071556b21ded (patch)
tree8066dac4c06c7be19f0d4acf80e6a4bcd93a8921 /utf8.c
parent3e3eb1f61aa7ee77634882c846122860a4c8ef42 (diff)
downloadperl-e932a5edb50acc7eadabb8b398bd071556b21ded.tar.gz
Refactor uvoffuni_to_utf8_flags_msgs
Having a fast UVOFFUNISKIP() allows this function to be refactored to simplify it. This commit continues to shortchange large code points and EBCDIC by a little. For example, it checks if a 4-byte character is above Unicode, but no 4-byte characters fit that description in UTF-EBCDIC. This will be fixed in the next commit, which will prepare for further enhancements.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c192
1 files changed, 73 insertions, 119 deletions
diff --git a/utf8.c b/utf8.c
index 7e4d95b92e..cd9f4481e6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -262,149 +262,103 @@ The caller, of course, is responsible for freeing any returned HV.
U8 *
Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, const UV flags, HV** msgs)
{
+ U8 *p;
+ UV shifted_uv = input_uv;
+ STRLEN utf8_skip = OFFUNISKIP(input_uv);
+
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
if (msgs) {
*msgs = NULL;
}
- if (OFFUNI_IS_INVARIANT(input_uv)) {
+ switch (utf8_skip) {
+ case 1:
*d++ = LATIN1_TO_NATIVE(input_uv);
return d;
- }
- if (input_uv <= MAX_UTF8_TWO_BYTE) {
- *d++ = I8_TO_NATIVE_UTF8(( input_uv >> SHIFT) | UTF_START_MARK(2));
- *d++ = I8_TO_NATIVE_UTF8(( input_uv & MASK) | MARK);
- return d;
- }
+ default:
+ if ( UNLIKELY(input_uv > MAX_LEGAL_CP
+ && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
+ {
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, input_uv));
+ }
- /* Not 2-byte; test for and handle 3-byte result. In the test immediately
- * below, the 16 is for start bytes E0-EF (which are all the possible ones
- * for 3 byte characters). The 2 is for 2 continuation bytes; these each
- * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
- * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
- * 0x800-0xFFFF on ASCII */
- if (input_uv < (16 * (1U << (2 * SHIFT)))) {
- *d++ = I8_TO_NATIVE_UTF8(( input_uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
- *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
- *d++ = I8_TO_NATIVE_UTF8(( input_uv /* (1 - 1) */ & MASK) | MARK);
-
-#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
- aren't tested here */
- /* The most likely code points in this range are below the surrogates.
- * Do an extra test to quickly exclude those. */
- if (UNLIKELY(input_uv >= UNICODE_SURROGATE_FIRST)) {
- if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(input_uv)
- || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(input_uv)))
- {
- HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
- }
- else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
- HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs);
- }
+ p = d + utf8_skip - 1;
+ while (p >= d + 4) {
+ *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
}
-#endif
- return d;
- }
- /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
- * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
- * happen starting with 4-byte characters on ASCII platforms. We unify the
- * code for these with EBCDIC, even though some of them require 5-bytes on
- * those, because khw believes the code saving is worth the very slight
- * performance hit on these high EBCDIC code points. */
+ /* FALLTHROUGH */
- if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
- if (UNLIKELY( input_uv > MAX_LEGAL_CP
- && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
- {
- Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, input_uv));
- }
- if ( (flags & UNICODE_WARN_SUPER)
- || ( (flags & UNICODE_WARN_PERL_EXTENDED)
- && UNICODE_IS_PERL_EXTENDED(input_uv)))
- {
- const char * format = super_cp_format;
- U32 category = packWARN(WARN_NON_UNICODE);
- U32 flag = UNICODE_GOT_SUPER;
-
- /* Choose the more dire applicable warning */
- if (UNICODE_IS_PERL_EXTENDED(input_uv)) {
- format = PL_extended_cp_format;
- category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
- if (flags & (UNICODE_WARN_PERL_EXTENDED
- |UNICODE_DISALLOW_PERL_EXTENDED))
- {
- flag = UNICODE_GOT_PERL_EXTENDED;
+ case 4:
+ if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
+ if ( (flags & UNICODE_WARN_SUPER)
+ || ( (flags & UNICODE_WARN_PERL_EXTENDED)
+ && UNICODE_IS_PERL_EXTENDED(input_uv)))
+ {
+ const char * format = super_cp_format;
+ U32 category = packWARN(WARN_NON_UNICODE);
+ U32 flag = UNICODE_GOT_SUPER;
+
+ /* Choose the more dire applicable warning */
+ if (UNICODE_IS_PERL_EXTENDED(input_uv)) {
+ format = PL_extended_cp_format;
+ category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
+ if (flags & (UNICODE_WARN_PERL_EXTENDED
+ |UNICODE_DISALLOW_PERL_EXTENDED))
+ {
+ flag = UNICODE_GOT_PERL_EXTENDED;
+ }
}
- }
- if (msgs) {
- *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
- category, flag);
+ if (msgs) {
+ *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
+ category, flag);
+ }
+ else if ( ckWARN_d(WARN_NON_UNICODE)
+ || ( (flag & UNICODE_GOT_PERL_EXTENDED)
+ && ckWARN(WARN_PORTABLE)))
+ {
+ Perl_warner(aTHX_ category, format, input_uv);
+ }
}
- else if ( ckWARN_d(WARN_NON_UNICODE)
- || ( (flag & UNICODE_GOT_PERL_EXTENDED)
- && ckWARN(WARN_PORTABLE)))
+ if ( (flags & UNICODE_DISALLOW_SUPER)
+ || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
+ && UNICODE_IS_PERL_EXTENDED(input_uv)))
{
- Perl_warner(aTHX_ category, format, input_uv);
+ return NULL;
}
}
- if ( (flags & UNICODE_DISALLOW_SUPER)
- || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
- && UNICODE_IS_PERL_EXTENDED(input_uv)))
- {
- return NULL;
- }
- }
- else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(input_uv))) {
- HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
- }
- /* Test for and handle 4-byte result. In the test immediately below, the
- * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
- * characters). The 3 is for 3 continuation bytes; these each contribute
- * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
- * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
- * 0x1_0000-0x1F_FFFF on ASCII */
- if (input_uv < (8 * (1U << (3 * SHIFT)))) {
- *d++ = I8_TO_NATIVE_UTF8(( input_uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
- *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
- *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
- *d++ = I8_TO_NATIVE_UTF8(( input_uv /* (1 - 1) */ & MASK) | MARK);
-
-#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
- characters. The end-plane non-characters for EBCDIC were
- handled just above */
- if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(input_uv))) {
- HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
- }
- else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
- HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs);
- }
-#endif
+ d[3] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ /* FALLTHROUGH */
- return d;
- }
+ case 3:
+ if (input_uv >= UNICODE_SURROGATE_FIRST) {
+ if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
+ HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
+ }
+ else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
+ HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs);
+ }
+ }
- /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
- * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
- * format. The unrolled version above turns out to not save all that much
- * time, and at these high code points (well above the legal Unicode range
- * on ASCII platforms, and well above anything in common use in EBCDIC),
- * khw believes that less code outweighs slight performance gains. */
+ d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ /* FALLTHROUGH */
- {
- STRLEN len = OFFUNISKIP(input_uv);
- U8 *p = d+len-1;
- while (p > d) {
- *p-- = I8_TO_NATIVE_UTF8((input_uv & MASK) | MARK);
- input_uv >>= SHIFT;
- }
- *p = I8_TO_NATIVE_UTF8((input_uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
- return d+len;
+ case 2:
+ d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
+ | UTF_START_MARK(utf8_skip));
+ break;
}
+
+ return d + utf8_skip;
}
/*