diff options
-rw-r--r-- | handy.h | 4 | ||||
-rw-r--r-- | hv.c | 4 | ||||
-rw-r--r-- | pp.c | 2 | ||||
-rw-r--r-- | regcomp.c | 2 | ||||
-rw-r--r-- | regexec.c | 16 | ||||
-rw-r--r-- | toke.c | 6 | ||||
-rw-r--r-- | utf8.c | 24 | ||||
-rw-r--r-- | utf8.h | 15 |
8 files changed, 44 insertions, 29 deletions
@@ -1579,7 +1579,7 @@ END_EXTERN_C ? _generic_isCC(*(p), classnum) \ : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ ? _generic_isCC( \ - TWO_BYTE_UTF8_TO_NATIVE(*(p), \ + EIGHT_BIT_UTF8_TO_NATIVE(*(p), \ *((p)+1 )), \ classnum) \ : utf8) @@ -1665,7 +1665,7 @@ END_EXTERN_C (UTF8_IS_INVARIANT(*(p)) \ ? macro(*(p)) \ : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ - ? macro(TWO_BYTE_UTF8_TO_NATIVE(*(p), *((p)+1))) \ + ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1)))\ : utf8) #define _generic_LC_swash_utf8(macro, classnum, p) \ @@ -3223,7 +3223,7 @@ Perl_refcounted_he_fetch_pvn(pTHX_ const struct refcounted_he *chain, } else { p++; - *q = (char) TWO_BYTE_UTF8_TO_NATIVE(c, *p); + *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p); } } } @@ -3399,7 +3399,7 @@ Perl_refcounted_he_new_pvn(pTHX_ struct refcounted_he *parent, } else { p++; - *q = (char) TWO_BYTE_UTF8_TO_NATIVE(c, *p); + *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p); } } } @@ -4349,7 +4349,7 @@ PP(pp_quotemeta) IN_LC_RUNTIME(LC_CTYPE) || #endif - _isQUOTEMETA(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s + 1)))) + _isQUOTEMETA(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1)))) { to_quote = TRUE; } @@ -12854,7 +12854,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -492,7 +492,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); + EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1))); } _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character)); @@ -2329,7 +2329,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1)), classnum)))) { @@ -5386,7 +5386,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) { sayNO; } @@ -5410,7 +5410,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) { sayNO; } @@ -5783,7 +5783,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, + (U8) EIGHT_BIT_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -5864,7 +5864,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, + ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), FLAGS(scan))))) { @@ -8141,7 +8141,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -8385,7 +8385,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan, *(scan + 1)), classnum)))) { @@ -1033,7 +1033,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) } else { assert(p < e -1 ); - *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); p += 2; } } @@ -2603,7 +2603,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) } s++; } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) { + if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) { goto bad_charname; } s += 2; @@ -2633,7 +2633,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) s++; } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) + if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) { goto bad_charname; } @@ -1057,7 +1057,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) if (u < uend) { U8 c1 = *u++; if (UTF8_IS_CONTINUATION(c1)) { - c = TWO_BYTE_UTF8_TO_NATIVE(c, c1); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1); } else { Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "Malformed UTF-8 character " @@ -1133,7 +1133,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) U8 c = *s++; if (! UTF8_IS_INVARIANT(c)) { /* Then it is two-byte encoded */ - c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); s++; } *d++ = c; @@ -1190,7 +1190,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) U8 c = *s++; if (! UTF8_IS_INVARIANT(c)) { /* Then it is two-byte encoded */ - c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); s++; } *d++ = c; @@ -1971,11 +1971,11 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); result = toUPPER_LC(c); } else { - return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), + return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, 'S'); } } @@ -2042,11 +2042,11 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); result = toUPPER_LC(c); } else { - return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), + return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, 's'); } } @@ -2112,11 +2112,11 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); result = toLOWER_LC(c); } else { - return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), + return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp); } } @@ -2194,11 +2194,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags & FOLD_FLAGS_LOCALE) { - U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); + U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); result = toFOLD_LC(c); } else { - return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), + return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } @@ -2723,7 +2723,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) else if (UTF8_IS_DOWNGRADEABLE_START(c)) { klen = 0; needents = 256; - off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1)); + off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1)); } else { klen = UTF8SKIP(ptr) - 1; @@ -350,6 +350,21 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than /* Longer, but more accurate name */ #define UTF8_IS_ABOVE_LATIN1_START(c) UTF8_IS_ABOVE_LATIN1(c) +/* Convert a UTF-8 variant Latin1 character to a native code point value. + * Needs just one iteration of accumulate. Should be used only if it is known + * that the code point is < 256, and is not UTF-8 invariant. Use the slower + * but more general TWO_BYTE_UTF8_TO_NATIVE() which handles any code point + * representable by two bytes (which turns out to be up through + * MAX_PORTABLE_UTF8_TWO_BYTE). The two parameters are: + * HI: a downgradable start byte; + * LO: continuation. + * */ +#define EIGHT_BIT_UTF8_TO_NATIVE(HI, LO) \ + ( __ASSERT_(UTF8_IS_DOWNGRADEABLE_START(HI)) \ + __ASSERT_(UTF8_IS_CONTINUATION(LO)) \ + LATIN1_TO_NATIVE(UTF8_ACCUMULATE(( \ + NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), (LO)))) + /* Convert a two (not one) byte utf8 character to a native code point value. * Needs just one iteration of accumulate. Should not be used unless it is * known that the two bytes are legal: 1) two-byte start, and 2) continuation. |