diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-05 16:31:11 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 05:14:44 -0600 |
commit | e932a5edb50acc7eadabb8b398bd071556b21ded (patch) | |
tree | 8066dac4c06c7be19f0d4acf80e6a4bcd93a8921 /utf8.c | |
parent | 3e3eb1f61aa7ee77634882c846122860a4c8ef42 (diff) | |
download | perl-e932a5edb50acc7eadabb8b398bd071556b21ded.tar.gz |
Refactor uvoffuni_to_utf8_flags_msgs
Having a fast UVOFFUNISKIP() allows this function be be refactored to
simplify it.
This commit continues to shortchange large code points and EBCDIC by a
little. For example, it checks if a 4-byte character is above Unicode,
but no 4-byte characters fit that description in UTF-EBCDIC. This will
be fixed in the next commit, which will prepare for further
enhancements.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 192 |
1 files changed, 73 insertions, 119 deletions
@@ -262,149 +262,103 @@ The caller, of course, is responsible for freeing any returned HV. U8 * Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, const UV flags, HV** msgs) { + U8 *p; + UV shifted_uv = input_uv; + STRLEN utf8_skip = OFFUNISKIP(input_uv); + PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS; if (msgs) { *msgs = NULL; } - if (OFFUNI_IS_INVARIANT(input_uv)) { + switch (utf8_skip) { + case 1: *d++ = LATIN1_TO_NATIVE(input_uv); return d; - } - if (input_uv <= MAX_UTF8_TWO_BYTE) { - *d++ = I8_TO_NATIVE_UTF8(( input_uv >> SHIFT) | UTF_START_MARK(2)); - *d++ = I8_TO_NATIVE_UTF8(( input_uv & MASK) | MARK); - return d; - } + default: + if ( UNLIKELY(input_uv > MAX_LEGAL_CP + && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))) + { + Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, input_uv)); + } - /* Not 2-byte; test for and handle 3-byte result. In the test immediately - * below, the 16 is for start bytes E0-EF (which are all the possible ones - * for 3 byte characters). The 2 is for 2 continuation bytes; these each - * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000 - * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC; - * 0x800-0xFFFF on ASCII */ - if (input_uv < (16 * (1U << (2 * SHIFT)))) { - *d++ = I8_TO_NATIVE_UTF8(( input_uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3)); - *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((2 - 1) * SHIFT)) & MASK) | MARK); - *d++ = I8_TO_NATIVE_UTF8(( input_uv /* (1 - 1) */ & MASK) | MARK); - -#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so - aren't tested here */ - /* The most likely code points in this range are below the surrogates. - * Do an extra test to quickly exclude those. */ - if (UNLIKELY(input_uv >= UNICODE_SURROGATE_FIRST)) { - if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(input_uv) - || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(input_uv))) - { - HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs); - } - else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) { - HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs); - } + p = d + utf8_skip - 1; + while (p >= d + 4) { + *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); + shifted_uv >>= SHIFT; } -#endif - return d; - } - /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII - * platforms, and 0x4000 on EBCDIC. There are problematic cases that can - * happen starting with 4-byte characters on ASCII platforms. We unify the - * code for these with EBCDIC, even though some of them require 5-bytes on - * those, because khw believes the code saving is worth the very slight - * performance hit on these high EBCDIC code points. */ + /* FALLTHROUGH */ - if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) { - if (UNLIKELY( input_uv > MAX_LEGAL_CP - && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))) - { - Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, input_uv)); - } - if ( (flags & UNICODE_WARN_SUPER) - || ( (flags & UNICODE_WARN_PERL_EXTENDED) - && UNICODE_IS_PERL_EXTENDED(input_uv))) - { - const char * format = super_cp_format; - U32 category = packWARN(WARN_NON_UNICODE); - U32 flag = UNICODE_GOT_SUPER; - - /* Choose the more dire applicable warning */ - if (UNICODE_IS_PERL_EXTENDED(input_uv)) { - format = PL_extended_cp_format; - category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); - if (flags & (UNICODE_WARN_PERL_EXTENDED - |UNICODE_DISALLOW_PERL_EXTENDED)) - { - flag = UNICODE_GOT_PERL_EXTENDED; + case 4: + if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) { + if ( (flags & UNICODE_WARN_SUPER) + || ( (flags & UNICODE_WARN_PERL_EXTENDED) + && UNICODE_IS_PERL_EXTENDED(input_uv))) + { + const char * format = super_cp_format; + U32 category = packWARN(WARN_NON_UNICODE); + U32 flag = UNICODE_GOT_SUPER; + + /* Choose the more dire applicable warning */ + if (UNICODE_IS_PERL_EXTENDED(input_uv)) { + format = PL_extended_cp_format; + category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); + if (flags & (UNICODE_WARN_PERL_EXTENDED + |UNICODE_DISALLOW_PERL_EXTENDED)) + { + flag = UNICODE_GOT_PERL_EXTENDED; + } } - } - if (msgs) { - *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), - category, flag); + if (msgs) { + *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), + category, flag); + } + else if ( ckWARN_d(WARN_NON_UNICODE) + || ( (flag & UNICODE_GOT_PERL_EXTENDED) + && ckWARN(WARN_PORTABLE))) + { + Perl_warner(aTHX_ category, format, input_uv); + } } - else if ( ckWARN_d(WARN_NON_UNICODE) - || ( (flag & UNICODE_GOT_PERL_EXTENDED) - && ckWARN(WARN_PORTABLE))) + if ( (flags & UNICODE_DISALLOW_SUPER) + || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED) + && UNICODE_IS_PERL_EXTENDED(input_uv))) { - Perl_warner(aTHX_ category, format, input_uv); + return NULL; } } - if ( (flags & UNICODE_DISALLOW_SUPER) - || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED) - && UNICODE_IS_PERL_EXTENDED(input_uv))) - { - return NULL; - } - } - else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(input_uv))) { - HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs); - } - /* Test for and handle 4-byte result. In the test immediately below, the - * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte - * characters). The 3 is for 3 continuation bytes; these each contribute - * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on - * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC; - * 0x1_0000-0x1F_FFFF on ASCII */ - if (input_uv < (8 * (1U << (3 * SHIFT)))) { - *d++ = I8_TO_NATIVE_UTF8(( input_uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4)); - *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((3 - 1) * SHIFT)) & MASK) | MARK); - *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((2 - 1) * SHIFT)) & MASK) | MARK); - *d++ = I8_TO_NATIVE_UTF8(( input_uv /* (1 - 1) */ & MASK) | MARK); - -#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte - characters. The end-plane non-characters for EBCDIC were - handled just above */ - if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(input_uv))) { - HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs); - } - else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) { - HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs); - } -#endif + d[3] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); + shifted_uv >>= SHIFT; + /* FALLTHROUGH */ - return d; - } + case 3: + if (input_uv >= UNICODE_SURROGATE_FIRST) { + if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) { + HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs); + } + else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) { + HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs); + } + } - /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII - * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop - * format. The unrolled version above turns out to not save all that much - * time, and at these high code points (well above the legal Unicode range - * on ASCII platforms, and well above anything in common use in EBCDIC), - * khw believes that less code outweighs slight performance gains. */ + d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); + shifted_uv >>= SHIFT; + /* FALLTHROUGH */ - { - STRLEN len = OFFUNISKIP(input_uv); - U8 *p = d+len-1; - while (p > d) { - *p-- = I8_TO_NATIVE_UTF8((input_uv & MASK) | MARK); - input_uv >>= SHIFT; - } - *p = I8_TO_NATIVE_UTF8((input_uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); - return d+len; + case 2: + d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); + shifted_uv >>= SHIFT; + d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip)) + | UTF_START_MARK(utf8_skip)); + break; } + + return d + utf8_skip; } /* |