diff options
-rw-r--r-- | doop.c | 57 | ||||
-rw-r--r-- | embed.h | 36 | ||||
-rwxr-xr-x | embed.pl | 9 | ||||
-rw-r--r-- | ext/Encode/Encode.xs | 4 | ||||
-rw-r--r-- | global.sym | 9 | ||||
-rw-r--r-- | handy.h | 60 | ||||
-rw-r--r-- | objXSUB.h | 36 | ||||
-rw-r--r-- | op.c | 31 | ||||
-rw-r--r-- | perl.c | 9 | ||||
-rw-r--r-- | perlapi.c | 39 | ||||
-rw-r--r-- | pp.c | 49 | ||||
-rw-r--r-- | pp_ctl.c | 8 | ||||
-rw-r--r-- | proto.h | 9 | ||||
-rw-r--r-- | regcomp.c | 22 | ||||
-rw-r--r-- | regexec.c | 190 | ||||
-rw-r--r-- | sv.c | 14 | ||||
-rw-r--r-- | toke.c | 19 | ||||
-rw-r--r-- | utf8.c | 152 | ||||
-rw-r--r-- | utf8.h | 18 |
19 files changed, 449 insertions, 322 deletions
@@ -66,10 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv) UV c; /* Need to check this, otherwise 128..255 won't match */ - c = utf8_to_uv(s, send - s, &ulen, 0); + c = utf8n_to_uvchr(s, send - s, &ulen, 0); if (c < 0x100 && (ch = tbl[c]) >= 0) { matches++; - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); s += ulen; } else { /* No match -> copy */ @@ -117,7 +117,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ while (s < send) { UV c; STRLEN ulen; - c = utf8_to_uv(s, send - s, &ulen, 0); + c = utf8n_to_uvchr(s, send - s, &ulen, 0); if (c < 0x100) { if (tbl[c] >= 0) matches++; @@ -203,7 +203,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ UV pch = 0xfeedface; while (s < send) { STRLEN len; - UV comp = utf8_to_uv_simple(s, &len); + UV comp = utf8_to_uvchr(s, &len); if (comp > 0xff) { if (!complement) { @@ -216,7 +216,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ ch = (comp - 0x100 < rlen) ? tbl[comp+1] : tbl[0x100+rlen]; if (ch != pch) { - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); pch = ch; } s += len; @@ -227,7 +227,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else if ((ch = tbl[comp]) >= 0) { matches++; if (ch != pch) { - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); pch = ch; } s += len; @@ -246,7 +246,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else { while (s < send) { STRLEN len; - UV comp = utf8_to_uv_simple(s, &len); + UV comp = utf8_to_uvchr(s, &len); if (comp > 0xff) { if (!complement) { Copy(s, d, len, U8); @@ -255,15 +255,15 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else { matches++; if (!del) { - if (comp - 0x100 < rlen) - d = uv_to_utf8(d, tbl[comp+1]); + if (comp - 0x100 < rlen) + d = uvchr_to_utf8(d, tbl[comp+1]); else - d = uv_to_utf8(d, tbl[0x100+rlen]); + d = uvchr_to_utf8(d, tbl[0x100+rlen]); } } } else if ((ch = tbl[comp]) >= 0) { - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); matches++; } else if (ch == -1) { /* -1 is unmapped character */ @@ -343,7 +343,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ if ((uv = swash_fetch(rv, s)) < none) { s += UTF8SKIP(s); matches++; - d = uv_to_utf8(d, uv); + d = uvchr_to_utf8(d, uv); } else if (uv == none) { int i = UTF8SKIP(s); @@ -355,7 +355,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ int i = UTF8SKIP(s); s += i; matches++; - d = uv_to_utf8(d, final); + d = uvchr_to_utf8(d, final); } else s += UTF8SKIP(s); @@ -382,6 +382,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ } SvSETMAGIC(sv); SvUTF8_on(sv); + /* Downgrading just 'cos it will is suspect - NI-S */ if (!isutf8 && !(PL_hints & HINT_UTF8)) sv_utf8_downgrade(sv, TRUE); @@ -479,7 +480,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ UV puv = 0xfeedface; while (s < send) { uv = swash_fetch(rv, s); - + if (d > dend) { STRLEN clen = d - dstart; STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; @@ -493,7 +494,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ matches++; s += UTF8SKIP(s); if (uv != puv) { - d = uv_to_utf8(d, uv); + d = uvchr_to_utf8(d, uv); puv = uv; } continue; @@ -510,7 +511,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ matches++; s += UTF8SKIP(s); if (uv != puv) { - d = uv_to_utf8(d, final); + d = uvchr_to_utf8(d, final); puv = final; } continue; @@ -534,7 +535,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ if (uv < none) { matches++; s += UTF8SKIP(s); - d = uv_to_utf8(d, uv); + d = uvchr_to_utf8(d, uv); continue; } else if (uv == none) { /* "none" is unmapped character */ @@ -547,7 +548,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ else if (uv == extra && !del) { matches++; s += UTF8SKIP(s); - d = uv_to_utf8(d, final); + d = uvchr_to_utf8(d, final); continue; } matches++; /* "none+1" is delete character */ @@ -934,7 +935,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv) s = send - 1; while (s > start && UTF8_IS_CONTINUATION(*s)) s--; - if (utf8_to_uv_simple((U8*)s, 0)) { + if (utf8_to_uvchr((U8*)s, 0)) { sv_setpvn(astr, s, send - s); *s = '\0'; SvCUR_set(sv, s - start); @@ -1100,14 +1101,14 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) switch (optype) { case OP_BIT_AND: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); + luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); + ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc & ruc; - dc = (char*)uv_to_utf8((U8*)dc, duc); + dc = (char*)uvchr_to_utf8((U8*)dc, duc); } if (sv == left || sv == right) (void)sv_usepvn(sv, dcsave, needlen); @@ -1115,26 +1116,26 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) break; case OP_BIT_XOR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); + luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); + ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc ^ ruc; - dc = (char*)uv_to_utf8((U8*)dc, duc); + dc = (char*)uvchr_to_utf8((U8*)dc, duc); } goto mop_up_utf; case OP_BIT_OR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); + luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); + ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc | ruc; - dc = (char*)uv_to_utf8((U8*)dc, duc); + dc = (char*)uvchr_to_utf8((U8*)dc, duc); } mop_up_utf: if (sv == left || sv == right) @@ -740,9 +740,12 @@ #define utf8_to_bytes Perl_utf8_to_bytes #define bytes_from_utf8 Perl_bytes_from_utf8 #define bytes_to_utf8 Perl_bytes_to_utf8 -#define utf8_to_uv_simple Perl_utf8_to_uv_simple -#define utf8_to_uv Perl_utf8_to_uv -#define uv_to_utf8 Perl_uv_to_utf8 +#define utf8_to_uvchr Perl_utf8_to_uvchr +#define utf8_to_uvuni Perl_utf8_to_uvuni +#define utf8n_to_uvchr Perl_utf8n_to_uvchr +#define utf8n_to_uvuni Perl_utf8n_to_uvuni +#define uvchr_to_utf8 Perl_uvchr_to_utf8 +#define uvuni_to_utf8 Perl_uvuni_to_utf8 #define vivify_defelem Perl_vivify_defelem #define vivify_ref Perl_vivify_ref #define wait4pid Perl_wait4pid @@ -2226,9 +2229,12 @@ #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) #define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) #define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) -#define utf8_to_uv_simple(a,b) Perl_utf8_to_uv_simple(aTHX_ a,b) -#define utf8_to_uv(a,b,c,d) Perl_utf8_to_uv(aTHX_ a,b,c,d) -#define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b) +#define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b) +#define utf8_to_uvuni(a,b) Perl_utf8_to_uvuni(aTHX_ a,b) +#define utf8n_to_uvchr(a,b,c,d) Perl_utf8n_to_uvchr(aTHX_ a,b,c,d) +#define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d) +#define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX_ a,b) +#define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b) #define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a) #define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b) #define wait4pid(a,b,c) Perl_wait4pid(aTHX_ a,b,c) @@ -4366,12 +4372,18 @@ #define bytes_from_utf8 Perl_bytes_from_utf8 #define Perl_bytes_to_utf8 CPerlObj::Perl_bytes_to_utf8 #define bytes_to_utf8 Perl_bytes_to_utf8 -#define Perl_utf8_to_uv_simple CPerlObj::Perl_utf8_to_uv_simple -#define utf8_to_uv_simple Perl_utf8_to_uv_simple -#define Perl_utf8_to_uv CPerlObj::Perl_utf8_to_uv -#define utf8_to_uv Perl_utf8_to_uv -#define Perl_uv_to_utf8 CPerlObj::Perl_uv_to_utf8 -#define uv_to_utf8 Perl_uv_to_utf8 +#define Perl_utf8_to_uvchr CPerlObj::Perl_utf8_to_uvchr +#define utf8_to_uvchr Perl_utf8_to_uvchr +#define Perl_utf8_to_uvuni CPerlObj::Perl_utf8_to_uvuni +#define utf8_to_uvuni Perl_utf8_to_uvuni +#define Perl_utf8n_to_uvchr CPerlObj::Perl_utf8n_to_uvchr +#define utf8n_to_uvchr Perl_utf8n_to_uvchr +#define Perl_utf8n_to_uvuni CPerlObj::Perl_utf8n_to_uvuni +#define utf8n_to_uvuni Perl_utf8n_to_uvuni +#define Perl_uvchr_to_utf8 CPerlObj::Perl_uvchr_to_utf8 +#define uvchr_to_utf8 Perl_uvchr_to_utf8 +#define Perl_uvuni_to_utf8 CPerlObj::Perl_uvuni_to_utf8 +#define uvuni_to_utf8 Perl_uvuni_to_utf8 #define Perl_vivify_defelem CPerlObj::Perl_vivify_defelem #define vivify_defelem Perl_vivify_defelem #define Perl_vivify_ref CPerlObj::Perl_vivify_ref @@ -2094,9 +2094,12 @@ Apd |U8* |utf8_hop |U8 *s|I32 off ApMd |U8* |utf8_to_bytes |U8 *s|STRLEN *len ApMd |U8* |bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8 ApMd |U8* |bytes_to_utf8 |U8 *s|STRLEN *len -Apd |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen -Adp |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags -Apd |U8* |uv_to_utf8 |U8 *d|UV uv +Apd |UV |utf8_to_uvchr |U8 *s|STRLEN* retlen +Apd |UV |utf8_to_uvuni |U8 *s|STRLEN* retlen +Adp |UV |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags +Adp |UV |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags +Apd |U8* |uvchr_to_utf8 |U8 *d|UV uv +Apd |U8* |uvuni_to_utf8 |U8 *d|UV uv p |void |vivify_defelem |SV* sv p |void |vivify_ref |SV* sv|U32 to_what p |I32 |wait4pid |Pid_t pid|int* statusp|int flags diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 584849ac47..1332adab7c 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -385,8 +385,8 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) if (!check && ckWARN_d(WARN_UTF8)) { STRLEN clen; - UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0); - Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%"UVxf"}\" does not map to %s", ch, enc->name[0]); + UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0); + Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]); /* FIXME: Skip over the character, copy in replacement and continue * but that is messy so for now just fail. */ diff --git a/global.sym b/global.sym index cdc36aaaf8..870fccfa3d 100644 --- a/global.sym +++ b/global.sym @@ -473,9 +473,12 @@ Perl_utf8_hop Perl_utf8_to_bytes Perl_bytes_from_utf8 Perl_bytes_to_utf8 -Perl_utf8_to_uv_simple -Perl_utf8_to_uv -Perl_uv_to_utf8 +Perl_utf8_to_uvchr +Perl_utf8_to_uvuni +Perl_utf8n_to_uvchr +Perl_utf8n_to_uvuni +Perl_uvchr_to_utf8 +Perl_uvuni_to_utf8 Perl_warn Perl_vwarn Perl_warner @@ -423,21 +423,21 @@ Converts the specified character to lowercase. #define isPSXSPC_uni(c) (isSPACE_uni(c) ||(c) == '\f') #define isBLANK_uni(c) isBLANK(c) /* could be wrong */ -#define isALNUM_LC_uni(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c)) -#define isIDFIRST_LC_uni(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c)) -#define isALPHA_LC_uni(c) (c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c)) -#define isSPACE_LC_uni(c) (c < 256 ? isSPACE_LC(c) : is_uni_space_lc(c)) -#define isDIGIT_LC_uni(c) (c < 256 ? isDIGIT_LC(c) : is_uni_digit_lc(c)) -#define isUPPER_LC_uni(c) (c < 256 ? isUPPER_LC(c) : is_uni_upper_lc(c)) -#define isLOWER_LC_uni(c) (c < 256 ? isLOWER_LC(c) : is_uni_lower_lc(c)) -#define isALNUMC_LC_uni(c) (c < 256 ? isALNUMC_LC(c) : is_uni_alnumc_lc(c)) -#define isCNTRL_LC_uni(c) (c < 256 ? isCNTRL_LC(c) : is_uni_cntrl_lc(c)) -#define isGRAPH_LC_uni(c) (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c)) -#define isPRINT_LC_uni(c) (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c)) -#define isPUNCT_LC_uni(c) (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c)) -#define toUPPER_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c)) -#define toTITLE_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c)) -#define toLOWER_LC_uni(c) (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c)) +#define isALNUM_LC_uvchr(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c)) +#define isIDFIRST_LC_uvchr(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c)) +#define isALPHA_LC_uvchr(c) (c < 256 ? isALPHA_LC(c) : is_uni_alpha_lc(c)) +#define isSPACE_LC_uvchr(c) (c < 256 ? isSPACE_LC(c) : is_uni_space_lc(c)) +#define isDIGIT_LC_uvchr(c) (c < 256 ? isDIGIT_LC(c) : is_uni_digit_lc(c)) +#define isUPPER_LC_uvchr(c) (c < 256 ? isUPPER_LC(c) : is_uni_upper_lc(c)) +#define isLOWER_LC_uvchr(c) (c < 256 ? isLOWER_LC(c) : is_uni_lower_lc(c)) +#define isALNUMC_LC_uvchr(c) (c < 256 ? isALNUMC_LC(c) : is_uni_alnumc_lc(c)) +#define isCNTRL_LC_uvchr(c) (c < 256 ? isCNTRL_LC(c) : is_uni_cntrl_lc(c)) +#define isGRAPH_LC_uvchr(c) (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c)) +#define isPRINT_LC_uvchr(c) (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c)) +#define isPUNCT_LC_uvchr(c) (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c)) +#define toUPPER_LC_uvchr(c) (c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c)) +#define toTITLE_LC_uvchr(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c)) +#define toLOWER_LC_uvchr(c) (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c)) #define isPSXSPC_LC_uni(c) (isSPACE_LC_uni(c) ||(c) == '\f') #define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */ @@ -463,21 +463,21 @@ Converts the specified character to lowercase. #define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f') #define isBLANK_utf8(c) isBLANK(c) /* could be wrong */ -#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) -#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, UTF8_MAXLEN, 0, 0)) +#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(utf8_to_uvchr(p, 0)) +#define toUPPER_LC_utf8(p) toUPPER_LC_uvchr(utf8_to_uvchr(p, 0)) +#define toTITLE_LC_utf8(p) toTITLE_LC_uvchr(utf8_to_uvchr(p, 0)) +#define toLOWER_LC_utf8(p) toLOWER_LC_uvchr(utf8_to_uvchr(p, 0)) #define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f') #define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */ @@ -1901,18 +1901,30 @@ #define Perl_bytes_to_utf8 pPerl->Perl_bytes_to_utf8 #undef bytes_to_utf8 #define bytes_to_utf8 Perl_bytes_to_utf8 -#undef Perl_utf8_to_uv_simple -#define Perl_utf8_to_uv_simple pPerl->Perl_utf8_to_uv_simple -#undef utf8_to_uv_simple -#define utf8_to_uv_simple Perl_utf8_to_uv_simple -#undef Perl_utf8_to_uv -#define Perl_utf8_to_uv pPerl->Perl_utf8_to_uv -#undef utf8_to_uv -#define utf8_to_uv Perl_utf8_to_uv -#undef Perl_uv_to_utf8 -#define Perl_uv_to_utf8 pPerl->Perl_uv_to_utf8 -#undef uv_to_utf8 -#define uv_to_utf8 Perl_uv_to_utf8 +#undef Perl_utf8_to_uvchr +#define Perl_utf8_to_uvchr pPerl->Perl_utf8_to_uvchr +#undef utf8_to_uvchr +#define utf8_to_uvchr Perl_utf8_to_uvchr +#undef Perl_utf8_to_uvuni +#define Perl_utf8_to_uvuni pPerl->Perl_utf8_to_uvuni +#undef utf8_to_uvuni +#define utf8_to_uvuni Perl_utf8_to_uvuni +#undef Perl_utf8n_to_uvchr +#define Perl_utf8n_to_uvchr pPerl->Perl_utf8n_to_uvchr +#undef utf8n_to_uvchr +#define utf8n_to_uvchr Perl_utf8n_to_uvchr +#undef Perl_utf8n_to_uvuni +#define Perl_utf8n_to_uvuni pPerl->Perl_utf8n_to_uvuni +#undef utf8n_to_uvuni +#define utf8n_to_uvuni Perl_utf8n_to_uvuni +#undef Perl_uvchr_to_utf8 +#define Perl_uvchr_to_utf8 pPerl->Perl_uvchr_to_utf8 +#undef uvchr_to_utf8 +#define uvchr_to_utf8 Perl_uvchr_to_utf8 +#undef Perl_uvuni_to_utf8 +#define Perl_uvuni_to_utf8 pPerl->Perl_uvuni_to_utf8 +#undef uvuni_to_utf8 +#define uvuni_to_utf8 Perl_uvuni_to_utf8 #undef Perl_warn #define Perl_warn pPerl->Perl_warn #undef warn @@ -125,7 +125,7 @@ S_trlist_upgrade(pTHX_ U8** sp, U8** ep) *ep = d; return *sp; } - + /* "register" allocation */ @@ -1361,7 +1361,7 @@ Perl_mod(pTHX_ OP *o, I32 type) PL_modcount++; return o; case OP_CONST: - if (o->op_private & (OPpCONST_BARE) && + if (o->op_private & (OPpCONST_BARE) && !(type == OP_GREPSTART || type == OP_ENTERSUB || type == OP_REFGEN)) { SV *sv = ((SVOP*)o)->op_sv; GV *gv; @@ -1376,8 +1376,8 @@ Perl_mod(pTHX_ OP *o, I32 type) OP* enter; gv = gv_fetchpv(SvPV_nolen(sv), TRUE, SVt_PVCV); - enter = newUNOP(OP_ENTERSUB,0, - newUNOP(OP_RV2CV, 0, + enter = newUNOP(OP_ENTERSUB,0, + newUNOP(OP_RV2CV, 0, newGVOP(OP_GV, 0, gv) )); enter->op_private |= OPpLVAL_INTRO; @@ -2708,26 +2708,27 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) for (j = 0; j < i; j++) { U8 *s = cp[j]; I32 cur = j < i - 1 ? cp[j+1] - s : tend - s; - UV val = utf8_to_uv(s, cur, &ulen, 0); + /* CHECKME: Use unicode code points for ranges - needs more thought ... NI-S */ + UV val = utf8n_to_uvuni(s, cur, &ulen, 0); s += ulen; diff = val - nextmin; if (diff > 0) { - t = uv_to_utf8(tmpbuf,nextmin); + t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); if (diff > 1) { - t = uv_to_utf8(tmpbuf, val - 1); + t = uvuni_to_utf8(tmpbuf, val - 1); sv_catpvn(transv, "\377", 1); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); } } if (s < tend && *s == 0xff) - val = utf8_to_uv(s+1, cur - 1, &ulen, 0); + val = utf8n_to_uvuni(s+1, cur - 1, &ulen, 0); if (val >= nextmin) nextmin = val + 1; } - t = uv_to_utf8(tmpbuf,nextmin); + t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - t = uv_to_utf8(tmpbuf, 0x7fffffff); + t = uvuni_to_utf8(tmpbuf, 0x7fffffff); sv_catpvn(transv, "\377", 1); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); t = (U8*)SvPVX(transv); @@ -2749,11 +2750,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) while (t < tend || tfirst <= tlast) { /* see if we need more "t" chars */ if (tfirst > tlast) { - tfirst = (I32)utf8_to_uv(t, tend - t, &ulen, 0); + tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0); t += ulen; if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */ t++; - tlast = (I32)utf8_to_uv(t, tend - t, &ulen, 0); + tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0); t += ulen; } else @@ -2763,11 +2764,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) /* now see if we need more "r" chars */ if (rfirst > rlast) { if (r < rend) { - rfirst = (I32)utf8_to_uv(r, rend - r, &ulen, 0); + rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0); r += ulen; if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */ r++; - rlast = (I32)utf8_to_uv(r, rend - r, &ulen, 0); + rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0); r += ulen; } else @@ -6796,7 +6797,7 @@ Perl_peep(pTHX_ register OP *o) case OP_MATCH: case OP_SUBST: o->op_seq = PL_op_seqmax++; - while (cPMOP->op_pmreplstart && + while (cPMOP->op_pmreplstart && cPMOP->op_pmreplstart->op_type == OP_NULL) cPMOP->op_pmreplstart = cPMOP->op_pmreplstart->op_next; peep(cPMOP->op_pmreplstart); @@ -253,9 +253,10 @@ perl_construct(pTHXx) if (PERL_REVISION > 127 || PERL_VERSION > 127 || PERL_SUBVERSION > 127) SvGROW(PL_patchlevel, UTF8_MAXLEN*3+1); s = (U8*)SvPVX(PL_patchlevel); - s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_REVISION))); - s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_VERSION))); - s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_SUBVERSION))); + /* Build version strings using "native" characters */ + s = uvchr_to_utf8(s, (UV)PERL_REVISION); + s = uvchr_to_utf8(s, (UV)PERL_VERSION); + s = uvchr_to_utf8(s, (UV)PERL_SUBVERSION); *s = '\0'; SvCUR_set(PL_patchlevel, s - (U8*)SvPVX(PL_patchlevel)); SvPOK_on(PL_patchlevel); @@ -3357,7 +3358,7 @@ S_init_postdump_symbols(pTHX_ register int argc, register char **argv, register #ifdef NEED_ENVIRON_DUP_FOR_MODIFY { char **env_base; - for (env_base = env; *env; env++) + for (env_base = env; *env; env++) dup_env_count++; if ((dup_env_base = (char **) safesysmalloc( sizeof(char *) * (dup_env_count+1) ))) { @@ -3434,25 +3434,46 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len) return ((CPerlObj*)pPerl)->Perl_bytes_to_utf8(s, len); } -#undef Perl_utf8_to_uv_simple +#undef Perl_utf8_to_uvchr UV -Perl_utf8_to_uv_simple(pTHXo_ U8 *s, STRLEN* retlen) +Perl_utf8_to_uvchr(pTHXo_ U8 *s, STRLEN* retlen) { - return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_simple(s, retlen); + return ((CPerlObj*)pPerl)->Perl_utf8_to_uvchr(s, retlen); } -#undef Perl_utf8_to_uv +#undef Perl_utf8_to_uvuni UV -Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags) +Perl_utf8_to_uvuni(pTHXo_ U8 *s, STRLEN* retlen) { - return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, curlen, retlen, flags); + return ((CPerlObj*)pPerl)->Perl_utf8_to_uvuni(s, retlen); } -#undef Perl_uv_to_utf8 +#undef Perl_utf8n_to_uvchr +UV +Perl_utf8n_to_uvchr(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags) +{ + return ((CPerlObj*)pPerl)->Perl_utf8n_to_uvchr(s, curlen, retlen, flags); +} + +#undef Perl_utf8n_to_uvuni +UV +Perl_utf8n_to_uvuni(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags) +{ + return ((CPerlObj*)pPerl)->Perl_utf8n_to_uvuni(s, curlen, retlen, flags); +} + +#undef Perl_uvchr_to_utf8 +U8* +Perl_uvchr_to_utf8(pTHXo_ U8 *d, UV uv) +{ + return ((CPerlObj*)pPerl)->Perl_uvchr_to_utf8(d, uv); +} + +#undef Perl_uvuni_to_utf8 U8* -Perl_uv_to_utf8(pTHXo_ U8 *d, UV uv) +Perl_uvuni_to_utf8(pTHXo_ U8 *d, UV uv) { - return ((CPerlObj*)pPerl)->Perl_uv_to_utf8(d, uv); + return ((CPerlObj*)pPerl)->Perl_uvuni_to_utf8(d, uv); } #undef Perl_warn @@ -1281,7 +1281,7 @@ PP(pp_subtract) UV result; register UV buv; bool buvok = SvUOK(TOPs); - + if (buvok) buv = SvUVX(TOPs); else { @@ -2138,7 +2138,7 @@ PP(pp_complement) send = tmps + len; while (tmps < send) { - UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV); + UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV); tmps += UTF8SKIP(tmps); targlen += UNISKIP(~c); nchar++; @@ -2152,9 +2152,9 @@ PP(pp_complement) if (nwide) { Newz(0, result, targlen + 1, U8); while (tmps < send) { - UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV); + UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV); tmps += UTF8SKIP(tmps); - result = uv_to_utf8(result, ~c); + result = uvchr_to_utf8(result, ~c); } *result = '\0'; result -= targlen; @@ -2164,7 +2164,7 @@ PP(pp_complement) else { Newz(0, result, nchar + 1, U8); while (tmps < send) { - U8 c = (U8)utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); + U8 c = (U8)utf8n_to_uvchr(tmps, 0, &l, UTF8_ALLOW_ANY); tmps += UTF8SKIP(tmps); *result++ = ~c; } @@ -2934,7 +2934,7 @@ PP(pp_ord) STRLEN len; U8 *s = (U8*)SvPVx(argsv, len); - XPUSHu(DO_UTF8(argsv) ? utf8_to_uv_simple(s, 0) : (*s & 0xff)); + XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff)); RETURN; } @@ -2948,7 +2948,7 @@ PP(pp_chr) if (value > 255 && !IN_BYTE) { SvGROW(TARG, UNISKIP(value)+1); - tmps = (char*)uv_to_utf8((U8*)SvPVX(TARG), value); + tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value); SvCUR_set(TARG, tmps - SvPVX(TARG)); *tmps = '\0'; (void)SvPOK_only(TARG); @@ -2997,17 +2997,17 @@ PP(pp_ucfirst) STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tend; - UV uv = utf8_to_uv(s, slen, &ulen, 0); + UV uv; if (PL_op->op_private & OPpLOCALE) { TAINT; SvTAINTED_on(sv); - uv = toTITLE_LC_uni(uv); + uv = toTITLE_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0)); } else uv = toTITLE_utf8(s); - tend = uv_to_utf8(tmpbuf, uv); + tend = uvchr_to_utf8(tmpbuf, uv); if (!SvPADTMP(sv) || tend - tmpbuf != ulen || SvREADONLY(sv)) { dTARGET; @@ -3056,17 +3056,17 @@ PP(pp_lcfirst) STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tend; - UV uv = utf8_to_uv(s, slen, &ulen, 0); + UV uv; if (PL_op->op_private & OPpLOCALE) { TAINT; SvTAINTED_on(sv); - uv = toLOWER_LC_uni(uv); + uv = toLOWER_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0)); } else uv = toLOWER_utf8(s); - tend = uv_to_utf8(tmpbuf, uv); + tend = uvchr_to_utf8(tmpbuf, uv); if (!SvPADTMP(sv) || tend - tmpbuf != ulen || SvREADONLY(sv)) { dTARGET; @@ -3133,13 +3133,13 @@ PP(pp_uc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, len, &ulen, 0))); + d = uvchr_to_utf8(d, toUPPER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0))); s += ulen; } } else { while (s < send) { - d = uv_to_utf8(d, toUPPER_utf8( s )); + d = uvchr_to_utf8(d, toUPPER_utf8( s )); s += UTF8SKIP(s); } } @@ -3207,13 +3207,13 @@ PP(pp_lc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, len, &ulen, 0))); + d = uvchr_to_utf8(d, toLOWER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0))); s += ulen; } } else { while (s < send) { - d = uv_to_utf8(d, toLOWER_utf8(s)); + d = uvchr_to_utf8(d, toLOWER_utf8(s)); s += UTF8SKIP(s); } } @@ -3967,7 +3967,7 @@ PP(pp_reverse) continue; } else { - if (!utf8_to_uv_simple(s, 0)) + if (!utf8_to_uvchr(s, 0)) break; up = (char*)s; s += UTF8SKIP(s); @@ -4046,7 +4046,14 @@ PP(pp_unpack) STRLEN llen; STRLEN rlen; register char *pat = SvPV(left, llen); +#if 0 + /* Packed side is assumed to be octets - so force downgrade if it + has been UTF-8 encoded by accident + */ + register char *s = SvPVbyte(right, rlen); +#else register char *s = SvPV(right, rlen); +#endif char *strend = s + rlen; char *strbeg = s; register char *patend = pat + llen; @@ -4355,7 +4362,7 @@ PP(pp_unpack) if (checksum) { while (len-- > 0 && s < strend) { STRLEN alen; - auint = utf8_to_uv((U8*)s, strend - s, &alen, 0); + auint = utf8n_to_uvchr((U8*)s, strend - s, &alen, 0); along = alen; s += along; if (checksum > 32) @@ -4369,7 +4376,7 @@ PP(pp_unpack) EXTEND_MORTAL(len); while (len-- > 0 && s < strend) { STRLEN alen; - auint = utf8_to_uv((U8*)s, strend - s, &alen, 0); + auint = utf8n_to_uvchr((U8*)s, strend - s, &alen, 0); along = alen; s += along; sv = NEWSV(37, 0); @@ -5407,7 +5414,7 @@ PP(pp_pack) fromstr = NEXTFROM; auint = SvUV(fromstr); SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1); - SvCUR_set(cat, (char*)uv_to_utf8((U8*)SvEND(cat),auint) + SvCUR_set(cat, (char*)uvchr_to_utf8((U8*)SvEND(cat),auint) - SvPVX(cat)); } *SvEND(cat) = '\0'; @@ -164,7 +164,7 @@ PP(pp_substcont) register char *m = cx->sb_m; char *orig = cx->sb_orig; register REGEXP *rx = cx->sb_rx; - + rxres_restore(&cx->sb_rxres, rx); if (cx->sb_iters++) { @@ -3049,13 +3049,13 @@ PP(pp_require) U8 *s = (U8*)SvPVX(sv); U8 *end = (U8*)SvPVX(sv) + SvCUR(sv); if (s < end) { - rev = utf8_to_uv(s, end - s, &len, 0); + rev = utf8n_to_uvchr(s, end - s, &len, 0); s += len; if (s < end) { - ver = utf8_to_uv(s, end - s, &len, 0); + ver = utf8n_to_uvchr(s, end - s, &len, 0); s += len; if (s < end) - sver = utf8_to_uv(s, end - s, &len, 0); + sver = utf8n_to_uvchr(s, end - s, &len, 0); } } if (PERL_REVISION < rev @@ -821,9 +821,12 @@ PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8); PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len); -PERL_CALLCONV UV Perl_utf8_to_uv_simple(pTHX_ U8 *s, STRLEN* retlen); -PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); -PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv); +PERL_CALLCONV UV Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN* retlen); +PERL_CALLCONV UV Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN* retlen); +PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); +PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); +PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); +PERL_CALLCONV U8* Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv); PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv); PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what); PERL_CALLCONV I32 Perl_wait4pid(pTHX_ Pid_t pid, int* statusp, int flags); @@ -807,7 +807,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (UTF) { U8 *s = (U8*)STRING(scan); l = utf8_length(s, s + l); - uc = utf8_to_uv_simple(s, NULL); + uc = utf8_to_uvchr(s, NULL); } min += l; if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ @@ -862,7 +862,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (UTF) { U8 *s = (U8 *)STRING(scan); l = utf8_length(s, s + l); - uc = utf8_to_uv_simple(s, NULL); + uc = utf8_to_uvchr(s, NULL); } min += l; if (data && (flags & SCF_DO_SUBSTR)) @@ -2928,7 +2928,7 @@ tryagain: default: normal_default: if (UTF8_IS_START(*p) && UTF) { - ender = utf8_to_uv((U8*)p, RExC_end - p, + ender = utf8n_to_uvuni((U8*)p, RExC_end - p, &numlen, 0); p += numlen; } @@ -2940,7 +2940,7 @@ tryagain: p = regwhite(p, RExC_end); if (UTF && FOLD) { if (LOC) - ender = toLOWER_LC_uni(ender); + ender = toLOWER_LC_uvchr(UNI_TO_NATIVE(ender)); else ender = toLOWER_uni(ender); } @@ -3227,7 +3227,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!range) rangebegin = RExC_parse; if (UTF) { - value = utf8_to_uv((U8*)RExC_parse, + value = utf8n_to_uvuni((U8*)RExC_parse, RExC_end - RExC_parse, &numlen, 0); RExC_parse += numlen; @@ -3238,7 +3238,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) namedclass = regpposixcc(pRExC_state, value); else if (value == '\\') { if (UTF) { - value = utf8_to_uv((U8*)RExC_parse, + value = utf8n_to_uvuni((U8*)RExC_parse, RExC_end - RExC_parse, &numlen, 0); RExC_parse += numlen; @@ -3914,7 +3914,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) STATIC void S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp) { - *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s); + *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvuni_to_utf8((U8*)s, uv) - (U8*)s); } /* @@ -4284,7 +4284,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) U8 s[UTF8_MAXLEN+1]; for (i = 0; i <= 256; i++) { /* just the first 256 */ - U8 *e = uv_to_utf8(s, i); + U8 *e = uvuni_to_utf8(s, i); if (i < 256 && swash_fetch(sw, s)) { if (rangestart == -1) @@ -4294,14 +4294,14 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) if (i <= rangestart + 3) for (; rangestart < i; rangestart++) { - for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++) put_byte(sv, *p); } else { - for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++) put_byte(sv, *p); sv_catpv(sv, "-"); - for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) + for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++) put_byte(sv, *p); } rangestart = -1; @@ -38,11 +38,11 @@ # define Perl_re_intuit_start my_re_intuit_start /* *These* symbols are masked to allow static link. */ # define Perl_pregexec my_pregexec -# define Perl_reginitcolors my_reginitcolors +# define Perl_reginitcolors my_reginitcolors # define Perl_regclass_swash my_regclass_swash # define PERL_NO_GET_CONTEXT -#endif +#endif /*SUPPRESS 112*/ /* @@ -194,9 +194,9 @@ S_regcppop(pTHX) DEBUG_r( PerlIO_printf(Perl_debug_log, " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", - (UV)paren, (IV)PL_regstartp[paren], + (UV)paren, (IV)PL_regstartp[paren], (IV)(PL_reg_start_tmp[paren] - PL_bostr), - (IV)PL_regendp[paren], + (IV)PL_regendp[paren], (paren > *PL_reglastparen ? "(no)" : "")); ); } @@ -281,7 +281,7 @@ Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *stren /* nosave: For optimizations. */ { return - regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, + regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, nosave ? 0 : REXEC_COPY_STR); } @@ -293,11 +293,11 @@ S_cache_re(pTHX_ regexp *prog) PL_regprogram = prog->program; #endif PL_regnpar = prog->nparens; - PL_regdata = prog->data; - PL_reg_re = prog; + PL_regdata = prog->data; + PL_reg_re = prog; } -/* +/* * Need to implement the following flags for reg_anch: * * USE_INTUIT_NOML - Useful to call re_intuit_start() first @@ -407,7 +407,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (SvTAIL(check)) { slen = SvCUR(check); /* >= 1 */ - if ( strend - s > slen || strend - s < slen - 1 + if ( strend - s > slen || strend - s < slen - 1 || (strend - s == slen && strend[-1] != '\n')) { DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); goto fail_finish; @@ -468,7 +468,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, || ( BmRARE(check) == '\n' && (BmPREVIOUS(check) == SvCUR(check) - 1) && SvTAIL(check) )) - s = screaminstr(sv, check, + s = screaminstr(sv, check, start_shift + (s - strbeg), end_shift, pp, 0); else goto fail_finish; @@ -657,7 +657,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, Thus we can arrive here only if check substr is float. Redo checking for "other"=="fixed". */ - strpos = t + 1; + strpos = t + 1; DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); goto do_other_anchored; @@ -798,7 +798,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (t + start_shift >= check_at) /* Contradicts floating=check */ goto retry_floating_check; /* Recheck anchored substring, but not floating... */ - s = check_at; + s = check_at; if (!check) goto giveup; DEBUG_r( PerlIO_printf(Perl_debug_log, @@ -826,11 +826,11 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, goto hop_and_restart; } DEBUG_r( if (t != s) - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "By STCLASS: moving %ld --> %ld\n", (long)(t - i_strpos), (long)(s - i_strpos)); else - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "Does not contradict STCLASS...\n") ); } giveup: @@ -903,14 +903,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta STRLEN len; if (c1 == c2) while (s <= e) { - if ( utf8_to_uv_simple((U8*)s, &len) == c1 + if ( utf8_to_uvchr((U8*)s, &len) == c1 && regtry(prog, s) ) goto got_it; s += len; } else while (s <= e) { - UV c = utf8_to_uv_simple((U8*)s, &len); + UV c = utf8_to_uvchr((U8*)s, &len); if ( (c == c1 || c == c2) && regtry(prog, s) ) goto got_it; s += len; @@ -948,11 +948,11 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = '\n'; else { U8 *r = reghop3((U8*)s, -1, (U8*)startpos); - - tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); + + tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0); } tmp = ((OP(c) == BOUND ? - isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); + isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); while (s < strend) { if (tmp == !(OP(c) == BOUND ? swash_fetch(PL_utf8_alnum, (U8*)s) : @@ -990,11 +990,11 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = '\n'; else { U8 *r = reghop3((U8*)s, -1, (U8*)startpos); - - tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); + + tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0); } tmp = ((OP(c) == NBOUND ? - isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); + isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); while (s < strend) { if (tmp == !(OP(c) == NBOUND ? swash_fetch(PL_utf8_alnum, (U8*)s) : @@ -1424,7 +1424,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * else { if (prog->reganch & ROPT_UTF8 && do_utf8) { U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg); - PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0); + PL_regprev = utf8n_to_uvuni(s, (U8*)stringarg - s, NULL, 0); } else PL_regprev = (U32)stringarg[-1]; @@ -1554,7 +1554,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } /* Messy cases: unanchored match. */ - if (prog->anchored_substr && prog->reganch & ROPT_SKIP) { + if (prog->anchored_substr && prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */ /* it must be a one character string (XXXX Except UTF?) */ char ch = SvPVX(prog->anchored_substr)[0]; @@ -1593,13 +1593,13 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /*SUPPRESS 560*/ else if (do_utf8 == (UTF!=0) && (prog->anchored_substr != Nullsv - || (prog->float_substr != Nullsv + || (prog->float_substr != Nullsv && prog->float_max_offset < strend - s))) { - SV *must = prog->anchored_substr + SV *must = prog->anchored_substr ? prog->anchored_substr : prog->float_substr; - I32 back_max = + I32 back_max = prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset; - I32 back_min = + I32 back_min = prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; char *last = HOP3c(strend, /* Cannot start after this */ -(I32)(CHR_SVLEN(must) @@ -1620,11 +1620,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * dontbother = end_shift; strend = HOPc(strend, -dontbother); while ( (s <= last) && - ((flags & REXEC_SCREAM) + ((flags & REXEC_SCREAM) ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg, end_shift, &scream_pos, 0)) : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend), - (unsigned char*)strend, must, + (unsigned char*)strend, must, PL_multiline ? FBMrf_MULTILINE : 0))) ) { DEBUG_r( did_match = 1 ); if (HOPc(s, -back_max) > last1) { @@ -1694,13 +1694,13 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * if (memEQ(strend - len + 1, little, len - 1)) last = strend - len + 1; else if (!PL_multiline) - last = memEQ(strend - len, little, len) + last = memEQ(strend - len, little, len) ? strend - len : Nullch; else goto find_last; } else { find_last: - if (len) + if (len) last = rninstr(s, strend, little, little + len); else last = strend; /* matching `$' */ @@ -1769,7 +1769,7 @@ got_it: prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ } } - + return 1; phooey: @@ -1818,7 +1818,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos) DEFSV = PL_reg_sv; } - if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) + if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) && (mg = mg_find(PL_reg_sv, 'g')))) { /* prepare for quick setting of pos */ sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0); @@ -1996,7 +1996,7 @@ S_regmatch(pTHX_ regnode *prog) after the current position the third one. We assume that pref0_len <= pref_len, otherwise we decrease pref0_len. */ - int pref_len = (locinput - PL_bostr) > (5 + taill) - l + int pref_len = (locinput - PL_bostr) > (5 + taill) - l ? (5 + taill) - l : locinput - PL_bostr; int pref0_len; @@ -2004,7 +2004,7 @@ S_regmatch(pTHX_ regnode *prog) pref_len++; pref0_len = pref_len - (locinput - PL_reg_starttry); if (l + pref_len < (5 + taill) && l < PL_regeol - locinput) - l = ( PL_regeol - locinput > (5 + taill) - pref_len + l = ( PL_regeol - locinput > (5 + taill) - pref_len ? (5 + taill) - pref_len : PL_regeol - locinput); while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) l--; @@ -2013,12 +2013,12 @@ S_regmatch(pTHX_ regnode *prog) if (pref0_len > pref_len) pref0_len = pref_len; regprop(prop, scan); - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n", - (IV)(locinput - PL_bostr), - PL_colors[4], pref0_len, + (IV)(locinput - PL_bostr), + PL_colors[4], pref0_len, locinput - pref_len, PL_colors[5], - PL_colors[2], pref_len - pref0_len, + PL_colors[2], pref_len - pref0_len, locinput - pref_len + pref0_len, PL_colors[3], (docolor ? "" : "> <"), PL_colors[0], l, locinput, PL_colors[1], @@ -2036,7 +2036,7 @@ S_regmatch(pTHX_ regnode *prog) case BOL: if (locinput == PL_bostr ? PL_regprev == '\n' - : (PL_multiline && + : (PL_multiline && (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) { /* regtill = regbol; */ @@ -2108,7 +2108,7 @@ S_regmatch(pTHX_ regnode *prog) while (s < e) { if (l >= PL_regeol) sayNO; - if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len)) + if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len)) sayNO; s++; l += len; @@ -2117,7 +2117,7 @@ S_regmatch(pTHX_ regnode *prog) while (s < e) { if (l >= PL_regeol) sayNO; - if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len)) + if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len)) sayNO; s += len; l++; @@ -2152,7 +2152,7 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) { sayNO; } - if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) != + if ((UTF ? utf8n_to_uvuni((U8*)s, e - s, 0, 0) : *((U8*)s)) != (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l))) sayNO; s += UTF ? UTF8SKIP(s) : 1; @@ -2252,15 +2252,15 @@ S_regmatch(pTHX_ regnode *prog) ln = PL_regprev; else { U8 *r = reghop((U8*)locinput, -1); - - ln = utf8_to_uv(r, s - (char*)r, 0, 0); + + ln = utf8n_to_uvuni(r, s - (char*)r, 0, 0); } if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM_uni(ln); n = swash_fetch(PL_utf8_alnum, (U8*)locinput); } else { - ln = isALNUM_LC_uni(ln); + ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); n = isALNUM_LC_utf8((U8*)locinput); } } @@ -2464,7 +2464,7 @@ S_regmatch(pTHX_ regnode *prog) COP *ocurcop = PL_curcop; SV **ocurpad = PL_curpad; SV *ret; - + n = ARG(scan); PL_op = (OP_4tree*)PL_regdata->data[n]; DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); @@ -2475,7 +2475,7 @@ S_regmatch(pTHX_ regnode *prog) SPAGAIN; ret = POPs; PUTBACK; - + PL_op = oop; PL_curpad = ocurpad; PL_curcop = ocurcop; @@ -2506,7 +2506,7 @@ S_regmatch(pTHX_ regnode *prog) pm.op_pmflags = 0; re = CALLREGCOMP(aTHX_ t, t + len, &pm); - if (!(SvFLAGS(ret) + if (!(SvFLAGS(ret) & (SVs_TEMP | SVs_PADTMP | SVf_READONLY))) sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0); PL_regprecomp = oprecomp; @@ -2514,7 +2514,7 @@ S_regmatch(pTHX_ regnode *prog) PL_regnpar = onpar; } DEBUG_r( - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "Entering embedded `%s%.60s%s%s'\n", PL_colors[0], re->precomp, @@ -2527,7 +2527,7 @@ S_regmatch(pTHX_ regnode *prog) state.re = PL_reg_re; PL_regcc = 0; - + cp = regcppush(0); /* Save *all* the positions. */ REGCP_SET(lastcp); cache_re(re); @@ -2616,7 +2616,7 @@ S_regmatch(pTHX_ regnode *prog) 1) After matching X, regnode for CURLYX is processed; - 2) This regnode creates infoblock on the stack, and calls + 2) This regnode creates infoblock on the stack, and calls regmatch() recursively with the starting point at WHILEM node; 3) Each hit of WHILEM node tries to match A and Z (in the order @@ -2637,7 +2637,7 @@ S_regmatch(pTHX_ regnode *prog) and whatever it mentions via ->next, and additional attached trees corresponding to temporarily unset infoblocks as in "5" above. - In the following picture infoblocks for outer loop of + In the following picture infoblocks for outer loop of (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block is denoted by x. The matched string is YAAZYAZT. Temporarily postponed infoblocks are drawn below the "reset" infoblock. @@ -2730,10 +2730,10 @@ S_regmatch(pTHX_ regnode *prog) PL_reginput = locinput; DEBUG_r( - PerlIO_printf(Perl_debug_log, - "%*s %ld out of %ld..%ld cc=%lx\n", + PerlIO_printf(Perl_debug_log, + "%*s %ld out of %ld..%ld cc=%lx\n", REPORT_CODE_OFF+PL_regindent*2, "", - (long)n, (long)cc->min, + (long)n, (long)cc->min, (long)cc->max, (long)cc) ); @@ -2832,7 +2832,7 @@ S_regmatch(pTHX_ regnode *prog) PL_regcc = cc; if (n >= cc->max) { /* Maximum greed exceeded? */ - if (ckWARN(WARN_REGEXP) && n >= REG_INFTY + if (ckWARN(WARN_REGEXP) && n >= REG_INFTY && !(PL_reg_flags & RF_warned)) { PL_reg_flags |= RF_warned; Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded", @@ -2884,7 +2884,7 @@ S_regmatch(pTHX_ regnode *prog) REPORT_CODE_OFF+PL_regindent*2, "") ); } - if (ckWARN(WARN_REGEXP) && n >= REG_INFTY + if (ckWARN(WARN_REGEXP) && n >= REG_INFTY && !(PL_reg_flags & RF_warned)) { PL_reg_flags |= RF_warned; Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded", @@ -2906,13 +2906,13 @@ S_regmatch(pTHX_ regnode *prog) sayNO; } /* NOT REACHED */ - case BRANCHJ: + case BRANCHJ: next = scan + ARG(scan); if (next == scan) next = NULL; inner = NEXTOPER(NEXTOPER(scan)); goto do_branch; - case BRANCH: + case BRANCH: inner = NEXTOPER(scan); do_branch: { @@ -2955,7 +2955,7 @@ S_regmatch(pTHX_ regnode *prog) { I32 l = 0; CHECKPOINT lastcp; - + /* We suppose that the next guy does not need backtracking: in particular, it is of constant length, and has no parenths to influence future backrefs. */ @@ -3133,7 +3133,7 @@ S_regmatch(pTHX_ regnode *prog) c2 = to_utf8_upper(s); } else { - c2 = c1 = utf8_to_uv_simple(s, NULL); + c2 = c1 = utf8_to_uvchr(s, NULL); } } } @@ -3176,7 +3176,7 @@ S_regmatch(pTHX_ regnode *prog) while (locinput <= e && *locinput != c1) locinput++; } else { - while (locinput <= e + while (locinput <= e && *locinput != c1 && *locinput != c2) locinput++; @@ -3188,20 +3188,20 @@ S_regmatch(pTHX_ regnode *prog) if (c1 == c2) { for (count = 0; locinput <= e && - utf8_to_uv_simple((U8*)locinput, &len) != c1; + utf8_to_uvchr((U8*)locinput, &len) != c1; count++) locinput += len; } else { for (count = 0; locinput <= e; count++) { - UV c = utf8_to_uv_simple((U8*)locinput, &len); + UV c = utf8_to_uvchr((U8*)locinput, &len); if (c == c1 || c == c2) break; - locinput += len; + locinput += len; } } } - if (locinput > e) + if (locinput > e) sayNO; /* PL_reginput == old now */ if (locinput != old) { @@ -3226,9 +3226,9 @@ S_regmatch(pTHX_ regnode *prog) UV c; if (c1 != -1000) { if (do_utf8) - c = utf8_to_uv_simple((U8*)PL_reginput, NULL); + c = utf8_to_uvchr((U8*)PL_reginput, NULL); else - c = UCHARAT(PL_reginput); + c = UCHARAT(PL_reginput); } /* If it could work, try it. */ if (c1 == -1000 || c == c1 || c == c2) @@ -3265,9 +3265,9 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uv_simple((U8*)PL_reginput, NULL); + c = utf8_to_uvchr((U8*)PL_reginput, NULL); else - c = UCHARAT(PL_reginput); + c = UCHARAT(PL_reginput); } /* If it could work, try it. */ if (c1 == -1000 || c == c1 || c == c2) @@ -3285,9 +3285,9 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uv_simple((U8*)PL_reginput, NULL); + c = utf8_to_uvchr((U8*)PL_reginput, NULL); else - c = UCHARAT(PL_reginput); + c = UCHARAT(PL_reginput); } /* If it could work, try it. */ if (c1 == -1000 || c == c1 || c == c2) @@ -3355,7 +3355,7 @@ S_regmatch(pTHX_ regnode *prog) case SUSPEND: n = 1; PL_reginput = locinput; - goto do_ifmatch; + goto do_ifmatch; case UNLESSM: n = 0; if (scan->flags) { @@ -3368,7 +3368,7 @@ S_regmatch(pTHX_ regnode *prog) PL_reginput = s; } else { - if (locinput < PL_bostr + scan->flags) + if (locinput < PL_bostr + scan->flags) goto say_yes; PL_reginput = locinput - scan->flags; goto do_ifmatch; @@ -3389,7 +3389,7 @@ S_regmatch(pTHX_ regnode *prog) PL_reginput = s; } else { - if (locinput < PL_bostr + scan->flags) + if (locinput < PL_bostr + scan->flags) goto say_no; PL_reginput = locinput - scan->flags; goto do_ifmatch; @@ -3482,14 +3482,14 @@ do_no: { re_unwind_branch_t *uwb = &(uw->branch); I32 lastparen = uwb->lastparen; - + REGCP_UNWIND(uwb->lastcp); for (n = *PL_reglastparen; n > lastparen; n--) PL_regendp[n] = -1; *PL_reglastparen = n; scan = next = uwb->next; - if ( !scan || - OP(scan) != (uwb->type == RE_UNWIND_BRANCH + if ( !scan || + OP(scan) != (uwb->type == RE_UNWIND_BRANCH ? BRANCH : BRANCHJ) ) { /* Failure */ unwind = uwb->prev; #ifdef DEBUGGING @@ -3739,22 +3739,22 @@ S_regrepeat(pTHX_ regnode *p, I32 max) c = scan - PL_reginput; PL_reginput = scan; - DEBUG_r( + DEBUG_r( { SV *prop = sv_newmortal(); regprop(prop, p); - PerlIO_printf(Perl_debug_log, - "%*s %s can match %"IVdf" times out of %"IVdf"...\n", + PerlIO_printf(Perl_debug_log, + "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max); }); - + return(c); } /* - regrepeat_hard - repeatedly match something, report total lenth and length - * + * * The repeater is supposed to have constant length. */ @@ -3800,7 +3800,7 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp) } if (!res) PL_reginput = scan; - + return count; } @@ -3821,10 +3821,10 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp) SV *rv = (SV*)PL_regdata->data[n]; AV *av = (AV*)SvRV((SV*)rv); SV **a; - + si = *av_fetch(av, 0, FALSE); a = av_fetch(av, 1, FALSE); - + if (a) sw = *a; else if (si && doinit) { @@ -3853,7 +3853,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) STRLEN len; if (do_utf8) - c = utf8_to_uv_simple(p, &len); + c = utf8_to_uvchr(p, &len); else c = *p; @@ -3872,13 +3872,13 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) match = TRUE; else if (flags & ANYOF_FOLD) { U8 tmpbuf[UTF8_MAXLEN+1]; - + if (flags & ANYOF_LOCALE) { PL_reg_flags |= RF_tainted; - uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p)); + uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p)); } else - uv_to_utf8(tmpbuf, toLOWER_utf8(p)); + uvchr_to_utf8(tmpbuf, toLOWER_utf8(p)); if (swash_fetch(sw, tmpbuf)) match = TRUE; } @@ -3946,13 +3946,13 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) STATIC U8 * S_reghop(pTHX_ U8 *s, I32 off) -{ +{ return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); } STATIC U8 * S_reghop3(pTHX_ U8 *s, I32 off, U8* lim) -{ +{ if (off >= 0) { while (off-- && s < lim) { /* XXX could check well-formedness here */ @@ -3976,7 +3976,7 @@ S_reghop3(pTHX_ U8 *s, I32 off, U8* lim) STATIC U8 * S_reghopmaybe(pTHX_ U8 *s, I32 off) -{ +{ return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); } @@ -4705,8 +4705,8 @@ Perl_sv_pos_b2u(pTHX_ register SV *sv, I32* offsetp) len = 0; while (s < send) { STRLEN n; - - if (utf8_to_uv(s, UTF8SKIP(s), &n, 0)) { + /* We can use low level directly here as we are not looking at the values */ + if (utf8n_to_uvuni(s, UTF8SKIP(s), &n, 0)) { s += n; len++; } @@ -7099,7 +7099,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV uv = args ? va_arg(*args, int) : SvIVx(argsv); if ((uv > 255 || (uv > 127 && SvUTF8(sv))) && !IN_BYTE) { eptr = (char*)utf8buf; - elen = uv_to_utf8((U8*)eptr, uv) - utf8buf; + elen = uvchr_to_utf8((U8*)eptr, uv) - utf8buf; is_utf = TRUE; } else { @@ -7183,13 +7183,11 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV if (!veclen) continue; if (vec_utf) - iv = (IV)utf8_to_uv(vecstr, veclen, &ulen, 0); + iv = (IV)utf8n_to_uvchr(vecstr, veclen, &ulen, 0); else { iv = *vecstr; ulen = 1; } - if (iv <256) - iv = NATIVE_TO_ASCII(iv); /* v-strings are codepoints */ vecstr += ulen; veclen -= ulen; } @@ -7265,13 +7263,11 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV if (!veclen) continue; if (vec_utf) - uv = utf8_to_uv(vecstr, veclen, &ulen, 0); + uv = utf8n_to_uvchr(vecstr, veclen, &ulen, 0); else { uv = *vecstr; ulen = 1; } - if (uv <256) - uv = NATIVE_TO_ASCII(uv); /* v-strings are codepoints */ vecstr += ulen; veclen -= ulen; } @@ -179,7 +179,7 @@ int yyactlevel = -1; STATIC void S_tokereport(pTHX_ char *thing, char* s, I32 rv) -{ +{ SV *report; DEBUG_T({ report = newSVpv(thing, 0); @@ -838,7 +838,7 @@ Perl_str_to_version(pTHX_ SV *sv) STRLEN skip; UV n; if (utf) - n = utf8_to_uv((U8*)start, len, &skip, 0); + n = utf8n_to_uvchr((U8*)start, len, &skip, 0); else { n = *(U8*)start; skip = 1; @@ -1475,7 +1475,7 @@ S_scan_const(pTHX_ char *start) if (hicount) { char *old_pvx = SvPVX(sv); char *src, *dst; - + d = SvGROW(sv, SvLEN(sv) + hicount + 1) + (d - old_pvx); @@ -1497,7 +1497,7 @@ S_scan_const(pTHX_ char *start) } if (has_utf8 || uv > 255) { - d = (char*)uv_to_utf8((U8*)d, uv); + d = (char*)uvchr_to_utf8((U8*)d, uv); has_utf8 = TRUE; if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { @@ -1622,14 +1622,14 @@ S_scan_const(pTHX_ char *start) STRLEN len = (STRLEN) -1; UV uv; if (this_utf8) { - uv = utf8_to_uv((U8*)s, send - s, &len, 0); + uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0); } if (len == (STRLEN)-1) { /* Illegal UTF8 (a high-bit byte), make it valid. */ char *old_pvx = SvPVX(sv); /* need space for one extra char (NOTE: SvCUR() not set here) */ d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); - d = (char*)uv_to_utf8((U8*)d, (U8)*s++); + d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++); } else { while (len--) @@ -7273,11 +7273,8 @@ vstring: "Integer overflow in decimal number"); } } - /* THIS IS EVIL */ - if (rev < 256) - rev = ASCII_TO_NATIVE(rev); - - tmpend = uv_to_utf8(tmpbuf, rev); + /* Append native character for the rev point */ + tmpend = uvchr_to_utf8(tmpbuf, rev); if (rev > revmax) revmax = rev; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); @@ -27,14 +27,14 @@ /* Unicode support */ /* -=for apidoc A|U8*|uv_to_utf8|U8 *d|UV uv +=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv Adds the UTF8 representation of the Unicode codepoint C<uv> to the end of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free bytes available. The return value is the pointer to the byte after the -end of the new character. In other words, +end of the new character. In other words, - d = uv_to_utf8(d, uv); + d = uvuni_to_utf8(d, uv); is the recommended Unicode-aware way of saying @@ -44,10 +44,8 @@ is the recommended Unicode-aware way of saying */ U8 * -Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) +Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) { - if (uv < 0x100) - uv = NATIVE_TO_ASCII(uv); if (uv < 0x80) { *d++ = uv; return d; @@ -121,13 +119,39 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) } /* +=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv + +Adds the UTF8 representation of the Native codepoint C<uv> to the end +of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free +bytes available. The return value is the pointer to the byte after the +end of the new character. In other words, + + d = uvchr_to_utf8(d, uv); + +is the recommended wide native character-aware way of saying + + *(d++) = uv; + +=cut +*/ + +U8 * +Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) +{ + if (uv < 0x100) + uv = NATIVE_TO_ASCII(uv); + return Perl_uvuni_to_utf8(aTHX_ d, uv); +} + + +/* =for apidoc A|STRLEN|is_utf8_char|U8 *s Tests if some arbitrary number of bytes begins in a valid UTF-8 character. Note that an ASCII character is a valid UTF-8 character. The actual number of bytes in the UTF-8 character will be returned if it is valid, otherwise 0. - + =cut */ STRLEN Perl_is_utf8_char(pTHX_ U8 *s) @@ -202,9 +226,10 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) } /* -=for apidoc A|UV|utf8_to_uv|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags +=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags -Returns the character value of the first character in the string C<s> +Bottom level UTF-8 decode routine. +Returns the unicode code point value of the first character in the string C<s> which is assumed to be in UTF8 encoding and no longer than C<curlen>; C<retlen> will be set to the length, in bytes, of that character. @@ -219,10 +244,12 @@ length of the UTF-8 character in bytes, and zero will be returned. The C<flags> can also contain various flags to allow deviations from the strict UTF-8 encoding (see F<utf8.h>). +Most code should use utf8_to_uvchr() rather than call this directly. + =cut */ UV -Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) +Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) { UV uv = *s, ouv; STRLEN len = 1; @@ -256,7 +283,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) if (UTF8_IS_ASCII(uv)) { if (retlen) *retlen = 1; - return ASCII_TO_NATIVE(*s); + return (UV) (*s); } if (UTF8_IS_CONTINUATION(uv) && @@ -270,7 +297,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) warning = UTF8_WARN_NON_CONTINUATION; goto malformed; } - + if ((uv == 0xfe || uv == 0xff) && !(flags & UTF8_ALLOW_FE_FF)) { warning = UTF8_WARN_FE_FF; @@ -287,7 +314,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) if (retlen) *retlen = len; - + expectlen = len; if ((curlen < expectlen) && @@ -417,12 +444,55 @@ malformed: } /* -=for apidoc A|U8* s|utf8_to_uv_simple|STRLEN *retlen +=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags -Returns the character value of the first character in the string C<s> +Returns the native character value of the first character in the string C<s> which is assumed to be in UTF8 encoding; C<retlen> will be set to the length, in bytes, of that character. +Allows length and flags to be passed to low level routine. + +=cut +*/ + +UV +Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) +{ + UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); + if (uv < 0x100) + return (UV) ASCII_TO_NATIVE(uv); + return uv; +} + +/* +=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen + +Returns the native character value of the first character in the string C<s> +which is assumed to be in UTF8 encoding; C<retlen> will be set to the +length, in bytes, of that character. + +If C<s> does not point to a well-formed UTF8 character, zero is +returned and retlen is set, if possible, to -1. + +=cut +*/ + +UV +Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen) +{ + return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0); +} + +/* +=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen + +Returns the Unicode code point of the first character in the string C<s> +which is assumed to be in UTF8 encoding; C<retlen> will be set to the +length, in bytes, of that character. + +This function should only be used when returned UV is considered +an index into the Unicode semantic tables (e.g. swashes). + If C<s> does not point to a well-formed UTF8 character, zero is returned and retlen is set, if possible, to -1. @@ -430,9 +500,10 @@ returned and retlen is set, if possible, to -1. */ UV -Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) +Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen) { - return Perl_utf8_to_uv(aTHX_ s, UTF8_MAXLEN, retlen, 0); + /* Call the low level routine asking for checks */ + return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0); } /* @@ -578,7 +649,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) d = s = save; while (s < send) { STRLEN ulen; - *d++ = (U8)utf8_to_uv_simple(s, &ulen); + *d++ = (U8)utf8_to_uvchr(s, &ulen); s += ulen; } *d = '\0'; @@ -751,7 +822,7 @@ bool Perl_is_uni_alnum(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_alnum(tmpbuf); } @@ -759,7 +830,7 @@ bool Perl_is_uni_alnumc(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_alnumc(tmpbuf); } @@ -767,7 +838,7 @@ bool Perl_is_uni_idfirst(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_idfirst(tmpbuf); } @@ -775,7 +846,7 @@ bool Perl_is_uni_alpha(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_alpha(tmpbuf); } @@ -783,7 +854,7 @@ bool Perl_is_uni_ascii(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_ascii(tmpbuf); } @@ -791,7 +862,7 @@ bool Perl_is_uni_space(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_space(tmpbuf); } @@ -799,7 +870,7 @@ bool Perl_is_uni_digit(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_digit(tmpbuf); } @@ -807,7 +878,7 @@ bool Perl_is_uni_upper(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_upper(tmpbuf); } @@ -815,7 +886,7 @@ bool Perl_is_uni_lower(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_lower(tmpbuf); } @@ -823,7 +894,7 @@ bool Perl_is_uni_cntrl(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_cntrl(tmpbuf); } @@ -831,7 +902,7 @@ bool Perl_is_uni_graph(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_graph(tmpbuf); } @@ -839,7 +910,7 @@ bool Perl_is_uni_print(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_print(tmpbuf); } @@ -847,7 +918,7 @@ bool Perl_is_uni_punct(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_punct(tmpbuf); } @@ -855,7 +926,7 @@ bool Perl_is_uni_xdigit(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return is_utf8_xdigit(tmpbuf); } @@ -863,7 +934,7 @@ U32 Perl_to_uni_upper(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return to_utf8_upper(tmpbuf); } @@ -871,7 +942,7 @@ U32 Perl_to_uni_title(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return to_utf8_title(tmpbuf); } @@ -879,7 +950,7 @@ U32 Perl_to_uni_lower(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uv_to_utf8(tmpbuf, (UV)c); + uvuni_to_utf8(tmpbuf, (UV)c); return to_utf8_lower(tmpbuf); } @@ -1158,7 +1229,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p) if (!PL_utf8_toupper) PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_toupper, p); - return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0); + return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); } UV @@ -1169,7 +1240,7 @@ Perl_to_utf8_title(pTHX_ U8 *p) if (!PL_utf8_totitle) PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_totitle, p); - return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0); + return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); } UV @@ -1180,7 +1251,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p) if (!PL_utf8_tolower) PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_tolower, p); - return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0); + return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); } /* a "swash" is a swatch hash */ @@ -1274,7 +1345,10 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, UTF8_MAXLEN, 0, 0) & ~(needents - 1)))); + /* We call utf8_to_uni as we want and index into Unicode tables, + not a native character number. + */ + PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1)))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR)) @@ -65,7 +65,7 @@ END_EXTERN_C #define UTF8_QUAD_MAX UINT64_C(0x1000000000) /* - + The following table is from Unicode 3.1. Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte @@ -100,7 +100,7 @@ END_EXTERN_C (uv) < 0x200000 ? 4 : \ (uv) < 0x4000000 ? 5 : \ (uv) < 0x80000000 ? 6 : \ - (uv) < UTF8_QUAD_MAX ? 7 : 13 ) + (uv) < UTF8_QUAD_MAX ? 7 : 13 ) #else /* No, I'm not even going to *TRY* putting #ifdef inside a #define */ #define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ @@ -132,21 +132,17 @@ END_EXTERN_C #define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1) #define isALNUM_lazy(p) isALNUM_lazy_if(p,1) -/* EBCDIC-happy ways of converting native code to UTF8; the reverse - process is taken care of in utf8_to_uv */ +/* EBCDIC-happy ways of converting native code to UTF8 */ #ifdef EBCDIC #define NATIVE_TO_ASCII(ch) PL_e2a[(ch)] #define ASCII_TO_NATIVE(ch) PL_a2e[(ch)] +#define UNI_TO_NATIVE(ch) (((ch) > 0x100) ? (ch) : (UV) PL_a2e[(ch)]) +#define NATIVE_TO_UNI(ch) (((ch) > 0x100) ? (ch) : (UV) PL_e2a[(ch)]) #else #define NATIVE_TO_ASCII(ch) (ch) #define ASCII_TO_NATIVE(ch) (ch) +#define UNI_TO_NATIVE(ch) (ch) +#define NATIVE_TO_UNI(ch) (ch) #endif -#define UTF8_NEEDS_UPGRADE(ch) (NATIVE_TO_ASCII(ch) & 0x80) -#define NATIVE_TO_UTF8(ch, string) STMT_START { \ - if (!UTF8_NEEDS_UPGRADE(ch)) \ - *(string)++ = NATIVE_TO_ASCII(ch); \ - else /* uv_to_utf8 is EBCDIC-aware */ \ - string = uv_to_utf8(string, ch); \ - } STMT_END |