summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--handy.h4
-rw-r--r--hv.c4
-rw-r--r--pp.c2
-rw-r--r--regcomp.c2
-rw-r--r--regexec.c16
-rw-r--r--toke.c6
-rw-r--r--utf8.c24
-rw-r--r--utf8.h15
8 files changed, 44 insertions, 29 deletions
diff --git a/handy.h b/handy.h
index 76dddc0a66..c3fd77d456 100644
--- a/handy.h
+++ b/handy.h
@@ -1579,7 +1579,7 @@ END_EXTERN_C
? _generic_isCC(*(p), classnum) \
: (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
? _generic_isCC( \
- TWO_BYTE_UTF8_TO_NATIVE(*(p), \
+ EIGHT_BIT_UTF8_TO_NATIVE(*(p), \
*((p)+1 )), \
classnum) \
: utf8)
@@ -1665,7 +1665,7 @@ END_EXTERN_C
(UTF8_IS_INVARIANT(*(p)) \
? macro(*(p)) \
: (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
- ? macro(TWO_BYTE_UTF8_TO_NATIVE(*(p), *((p)+1))) \
+ ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1)))\
: utf8)
#define _generic_LC_swash_utf8(macro, classnum, p) \
diff --git a/hv.c b/hv.c
index 9271901395..253cad9838 100644
--- a/hv.c
+++ b/hv.c
@@ -3223,7 +3223,7 @@ Perl_refcounted_he_fetch_pvn(pTHX_ const struct refcounted_he *chain,
}
else {
p++;
- *q = (char) TWO_BYTE_UTF8_TO_NATIVE(c, *p);
+ *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p);
}
}
}
@@ -3399,7 +3399,7 @@ Perl_refcounted_he_new_pvn(pTHX_ struct refcounted_he *parent,
}
else {
p++;
- *q = (char) TWO_BYTE_UTF8_TO_NATIVE(c, *p);
+ *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p);
}
}
}
diff --git a/pp.c b/pp.c
index 7e5dce181c..9dd3048b5c 100644
--- a/pp.c
+++ b/pp.c
@@ -4349,7 +4349,7 @@ PP(pp_quotemeta)
IN_LC_RUNTIME(LC_CTYPE)
||
#endif
- _isQUOTEMETA(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s + 1))))
+ _isQUOTEMETA(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1))))
{
to_quote = TRUE;
}
diff --git a/regcomp.c b/regcomp.c
index d4c51e9359..ccbccf8faa 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -12854,7 +12854,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
}
}
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+ if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
*s, *(s+1))))
{
break;
diff --git a/regexec.c b/regexec.c
index 7d323d5963..c88f46759c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -492,7 +492,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
}
else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
return isFOO_lc(classnum,
- TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
+ EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
}
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
@@ -2329,7 +2329,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
classnum)))
|| (UTF8_IS_DOWNGRADEABLE_START(*s)
&& to_complement ^ cBOOL(
- _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+ _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
*(s + 1)),
classnum))))
{
@@ -5386,7 +5386,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
l++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
{
sayNO;
}
@@ -5410,7 +5410,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
s++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
{
sayNO;
}
@@ -5783,7 +5783,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
- (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ (U8) EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1))))))
{
sayNO;
@@ -5864,7 +5864,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
FLAGS(scan)))))
{
@@ -8141,7 +8141,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
/* Target isn't utf8; convert the character in the UTF-8
* pattern to non-UTF8, and do a simple loop */
- c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
@@ -8385,7 +8385,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
}
else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
*(scan + 1)),
classnum))))
{
diff --git a/toke.c b/toke.c
index ef208f2be1..70318a7609 100644
--- a/toke.c
+++ b/toke.c
@@ -1033,7 +1033,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
}
else {
assert(p < e -1 );
- *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
p += 2;
}
}
@@ -2603,7 +2603,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
}
s++;
} else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
+ if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
goto bad_charname;
}
s += 2;
@@ -2633,7 +2633,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
s++;
}
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
+ if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
{
goto bad_charname;
}
diff --git a/utf8.c b/utf8.c
index 5d4a7cef5f..2a9d20e794 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1057,7 +1057,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
if (u < uend) {
U8 c1 = *u++;
if (UTF8_IS_CONTINUATION(c1)) {
- c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
} else {
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
"Malformed UTF-8 character "
@@ -1133,7 +1133,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
U8 c = *s++;
if (! UTF8_IS_INVARIANT(c)) {
/* Then it is two-byte encoded */
- c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
s++;
}
*d++ = c;
@@ -1190,7 +1190,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
U8 c = *s++;
if (! UTF8_IS_INVARIANT(c)) {
/* Then it is two-byte encoded */
- c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
s++;
}
*d++ = c;
@@ -1971,11 +1971,11 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags) {
- U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
result = toUPPER_LC(c);
}
else {
- return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+ return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
ustrp, lenp, 'S');
}
}
@@ -2042,11 +2042,11 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags) {
- U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
result = toUPPER_LC(c);
}
else {
- return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+ return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
ustrp, lenp, 's');
}
}
@@ -2112,11 +2112,11 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags) {
- U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
result = toLOWER_LC(c);
}
else {
- return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+ return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
ustrp, lenp);
}
}
@@ -2194,11 +2194,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags & FOLD_FLAGS_LOCALE) {
- U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
result = toFOLD_LC(c);
}
else {
- return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+ return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
ustrp, lenp,
flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
}
@@ -2723,7 +2723,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
klen = 0;
needents = 256;
- off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
+ off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
}
else {
klen = UTF8SKIP(ptr) - 1;
diff --git a/utf8.h b/utf8.h
index 271796b881..85bf590162 100644
--- a/utf8.h
+++ b/utf8.h
@@ -350,6 +350,21 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
/* Longer, but more accurate name */
#define UTF8_IS_ABOVE_LATIN1_START(c) UTF8_IS_ABOVE_LATIN1(c)
+/* Convert a UTF-8 variant Latin1 character to a native code point value.
+ * Needs just one iteration of accumulate. Should be used only if it is known
+ * that the code point is < 256, and is not UTF-8 invariant. Use the slower
+ * but more general TWO_BYTE_UTF8_TO_NATIVE() which handles any code point
+ * representable by two bytes (which turns out to be up through
+ * MAX_PORTABLE_UTF8_TWO_BYTE). The two parameters are:
+ * HI: a downgradable start byte;
+ * LO: continuation.
+ * */
+#define EIGHT_BIT_UTF8_TO_NATIVE(HI, LO) \
+ ( __ASSERT_(UTF8_IS_DOWNGRADEABLE_START(HI)) \
+ __ASSERT_(UTF8_IS_CONTINUATION(LO)) \
+ LATIN1_TO_NATIVE(UTF8_ACCUMULATE(( \
+ NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), (LO))))
+
/* Convert a two (not one) byte utf8 character to a native code point value.
* Needs just one iteration of accumulate. Should not be used unless it is
* known that the two bytes are legal: 1) two-byte start, and 2) continuation.