diff options
author | Nick Ing-Simmons <nik@tiuk.ti.com> | 2001-03-20 20:04:39 +0000 |
---|---|---|
committer | Nick Ing-Simmons <nik@tiuk.ti.com> | 2001-03-20 20:04:39 +0000 |
commit | 2b9d42f0ba1bb562fe21327dc7948ab1a5397a19 (patch) | |
tree | aee45626e3738deabafbe610cedef159d4c82d3b | |
parent | f2f6ab5ed2d2f824b4f6c3085a4a2275c2f8500a (diff) | |
download | perl-2b9d42f0ba1bb562fe21327dc7948ab1a5397a19.tar.gz |
More EBCDIC stuff:
- Loose the extra level of function on ASCII.
- spotted a chr(0) issue in sv.c
- re-work of UTF-X tr/// ranges to work in Unicode
space. Still issues with the "0xff is illegal UTF-8" hack.
- Yet another ad. hoc. utf8 'upgrade' in op.c recoded
(why do it once when you can do it all over the place :-(
- Enable HINTS_UTF8 on EBCDIC - then ignore it in toke.c,
need utf8.pm for swashes.
- Simplified and commented scan_const() in toke.c
Still something wrong regexp and tr (swashes?).
p4raw-id: //depot/perlio@9267
-rwxr-xr-x | embed.pl | 2 | ||||
-rw-r--r-- | lib/utf8.pm | 3 | ||||
-rw-r--r-- | op.c | 74 | ||||
-rw-r--r-- | regcomp.c | 16 | ||||
-rw-r--r-- | regexec.c | 10 | ||||
-rw-r--r-- | sv.c | 5 | ||||
-rw-r--r-- | toke.c | 92 | ||||
-rw-r--r-- | utf8.c | 137 | ||||
-rw-r--r-- | utf8.h | 4 | ||||
-rw-r--r-- | utfebcdic.h | 2 |
10 files changed, 188 insertions, 157 deletions
@@ -1260,6 +1260,8 @@ _EOE_ close(DOC); +unlink "pod/perlintern.pod"; + open(GUTS, ">pod/perlintern.pod") or die "Unable to create pod/perlintern.pod: $!\n"; print GUTS <<'END'; diff --git a/lib/utf8.pm b/lib/utf8.pm index 7c9a7dfb4e..f32c0bc5b7 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -1,6 +1,5 @@ package utf8; -if (ord('A') != 193) { # make things more pragmatic for EBCDIC folk $utf8::hint_bits = 0x00800000; @@ -21,8 +20,6 @@ sub AUTOLOAD { Carp::croak("Undefined subroutine $AUTOLOAD called"); } -} - 1; __END__ @@ -114,12 +114,12 @@ S_trlist_upgrade(pTHX_ U8** sp, U8** ep) *sp = d; while (s < e) { - if (*s < 0x80 || *s == 0xff) + if (NATIVE_IS_INVARIANT(*s) || NATIVE_TO_UTF(*s) == 0xff) *d++ = *s++; else { - U8 c = *s++; - *d++ = ((c >> 6) | 0xc0); - *d++ = ((c & 0x3f) | 0x80); + U8 c = NATIVE_TO_ASCII(*s++); + *d++ = UTF8_EIGHT_BIT_HI(c); + *d++ = UTF8_EIGHT_BIT_LO(c); } } *ep = d; @@ -2650,15 +2650,16 @@ Perl_newBINOP(pTHX_ I32 type, I32 flags, OP *first, OP *last) } static int -utf8compare(const void *a, const void *b) -{ - int i; - for (i = 0; i < 10; i++) { - if ((*(U8**)a)[i] < (*(U8**)b)[i]) - return -1; - if ((*(U8**)a)[i] > (*(U8**)b)[i]) - return 1; - } +uvcompare(const void *a, const void *b) +{ + if (*((UV *)a) < (*(UV *)b)) + return -1; + if (*((UV *)a) > (*(UV *)b)) + return 1; + if (*((UV *)a+1) < (*(UV *)b+1)) + return -1; + if (*((UV *)a+1) > (*(UV *)b+1)) + return 1; return 0; } @@ -2712,47 +2713,57 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) U8* tsave = from_utf ? NULL : trlist_upgrade(&t, &tend); U8* rsave = (to_utf || !rlen) ? NULL : trlist_upgrade(&r, &rend); +/* There are several snags with this code on EBCDIC: + 1. 0xFF is a legal UTF-EBCDIC byte (there are no illegal bytes). + 2. scan_const() in toke.c has encoded chars in native encoding which makes + ranges at least in EBCDIC 0..255 range the bottom odd. +*/ + if (complement) { U8 tmpbuf[UTF8_MAXLEN+1]; - U8** cp; + UV *cp; UV nextmin = 0; - New(1109, cp, tlen, U8*); + New(1109, cp, 2*tlen, UV); i = 0; transv = newSVpvn("",0); while (t < tend) { - cp[i++] = t; - t += UTF8SKIP(t); - if (t < tend && *t == 0xff) { + cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, 0); + t += ulen; + if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { t++; - t += UTF8SKIP(t); + cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, 0); + t += ulen; } + else { + cp[2*i+1] = cp[2*i]; + } + i++; } - qsort(cp, i, sizeof(U8*), utf8compare); + qsort(cp, i, 2*sizeof(UV), uvcompare); for (j = 0; j < i; j++) { - U8 *s = cp[j]; - I32 cur = j < i - 1 ? cp[j+1] - s : tend - s; - /* CHECKME: Use unicode code points for ranges - needs more thought ... NI-S */ - UV val = utf8n_to_uvuni(s, cur, &ulen, 0); - s += ulen; + UV val = cp[2*j]; diff = val - nextmin; if (diff > 0) { t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); if (diff > 1) { + U8 range_mark = UTF_TO_NATIVE(0xff); t = uvuni_to_utf8(tmpbuf, val - 1); - sv_catpvn(transv, "\377", 1); + sv_catpvn(transv, (char *)&range_mark, 1); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); } } - if (s < tend && *s == 0xff) - val = utf8n_to_uvuni(s+1, cur - 1, &ulen, 0); + val = cp[2*j+1]; if (val >= nextmin) nextmin = val + 1; } t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); + { + U8 range_mark = UTF_TO_NATIVE(0xff); + sv_catpvn(transv, (char *)&range_mark, 1); + } t = uvuni_to_utf8(tmpbuf, 0x7fffffff); - sv_catpvn(transv, "\377", 1); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); t = (U8*)SvPVX(transv); tlen = SvCUR(transv); @@ -2775,7 +2786,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (tfirst > tlast) { tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0); t += ulen; - if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */ + if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { /* illegal utf8 val indicates range */ t++; tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0); t += ulen; @@ -2789,7 +2800,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (r < rend) { rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0); r += ulen; - if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */ + if (r < rend && NATIVE_TO_UTF(*r) == 0xff) { /* illegal utf8 val indicates range */ r++; rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0); r += ulen; @@ -7040,3 +7051,4 @@ const_sv_xsub(pTHXo_ CV* cv) ST(0) = (SV*)XSANY.any_ptr; XSRETURN(1); } + @@ -2920,9 +2920,7 @@ tryagain: if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - /* ender is a Unicode value so it can be > 0xff -- - * in other words, do not use UTF8_IS_CONTINUED(). */ - else if (NATIVE_TO_ASCII(ender) >= 0x80 && UTF) { + else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { reguni(pRExC_state, ender, s, &numlen); s += numlen; len += numlen; @@ -2933,9 +2931,7 @@ tryagain: } break; } - /* ender is a Unicode value so it can be > 0xff -- - * in other words, do not use UTF8_IS_CONTINUED(). */ - if (NATIVE_TO_ASCII(ender) >= 0x80 && UTF) { + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { reguni(pRExC_state, ender, s, &numlen); s += numlen; len += numlen - 1; @@ -4251,7 +4247,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) U8 s[UTF8_MAXLEN+1]; for (i = 0; i <= 256; i++) { /* just the first 256 */ - U8 *e = uvuni_to_utf8(s, i); + U8 *e = uvchr_to_utf8(s, i); if (i < 256 && swash_fetch(sw, s)) { if (rangestart == -1) @@ -4261,14 +4257,14 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) if (i <= rangestart + 3) for (; rangestart < i; rangestart++) { - for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++) + for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++) put_byte(sv, *p); } else { - for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++) + for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++) put_byte(sv, *p); sv_catpv(sv, "-"); - for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++) + for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++) put_byte(sv, *p); } rangestart = -1; @@ -950,7 +950,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta else { U8 *r = reghop3((U8*)s, -1, (U8*)startpos); - tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0); + tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0); } tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); @@ -993,7 +993,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta else { U8 *r = reghop3((U8*)s, -1, (U8*)startpos); - tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0); + tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0); } tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); @@ -1433,7 +1433,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * else { if (prog->reganch & ROPT_UTF8 && do_utf8) { U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg); - PL_regprev = utf8n_to_uvuni(s, (U8*)stringarg - s, NULL, 0); + PL_regprev = utf8n_to_uvchr(s, (U8*)stringarg - s, NULL, 0); } else PL_regprev = (U32)stringarg[-1]; @@ -2161,7 +2161,7 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) { sayNO; } - if ((UTF ? utf8n_to_uvuni((U8*)s, e - s, 0, 0) : *((U8*)s)) != + if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) != (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l))) sayNO; s += UTF ? UTF8SKIP(s) : 1; @@ -2263,7 +2263,7 @@ S_regmatch(pTHX_ regnode *prog) else { U8 *r = reghop((U8*)locinput, -1); - ln = utf8n_to_uvuni(r, s - (char*)r, 0, 0); + ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0); } if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM_uni(ln); @@ -4733,8 +4733,9 @@ Perl_sv_pos_b2u(pTHX_ register SV *sv, I32* offsetp) len = 0; while (s < send) { STRLEN n; - /* We can use low level directly here as we are not looking at the values */ - if (utf8n_to_uvuni(s, UTF8SKIP(s), &n, 0)) { + /* Call utf8n_to_uvchr() to validate the sequence */ + utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0); + if (n > 0) { s += n; len++; } @@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen); #define XFAKEBRACK 128 #define XENUMMASK 127 -/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/ -#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || PL_hints & HINT_UTF8) +#ifdef EBCDIC +/* For now 'use utf8' does not affect tokenizer on EBCDIC */ +#define UTF (PL_linestr && DO_UTF8(PL_linestr)) +#else +#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8)) +#endif /* In variables name $^X, these are the legal values for X. * 1999-02-27 mjd-perl-patch@plover.com */ @@ -1218,22 +1222,22 @@ S_scan_const(pTHX_ char *start) register char *d = SvPVX(sv); /* destination for copies */ bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ - bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr)); - /* the constant is UTF8 */ + I32 has_utf8 = FALSE; /* Output constant is UTF8 */ + I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */ UV uv; - I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) - : UTF; - I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ? - OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF)) - : UTF; const char *leaveit = /* set of acceptably-backslashed characters */ PL_lex_inpat ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""; + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + /* If we are doing a trans and we know we want UTF8 set expectation */ + has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF); + this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + + while (s < send || dorange) { /* get transliterations out of the way (they're most literal) */ if (PL_lex_inwhat == OP_TRANS) { @@ -1243,17 +1247,18 @@ S_scan_const(pTHX_ char *start) I32 min; /* first character in range */ I32 max; /* last character in range */ - if (utf) { + if (has_utf8) { char *c = (char*)utf8_hop((U8*)d, -1); char *e = d++; while (e-- > c) *(e + 1) = *e; - *c = (char)0xff; + *c = UTF_TO_NATIVE(0xff); /* mark the range as done, and continue */ dorange = FALSE; didrange = TRUE; continue; } + i = d - SvPVX(sv); /* remember current offset */ SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */ d = SvPVX(sv) + i; /* refresh d after realloc */ @@ -1297,8 +1302,8 @@ S_scan_const(pTHX_ char *start) if (didrange) { Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); } - if (utf) { - *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */ + if (has_utf8) { + *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */ s++; continue; } @@ -1368,6 +1373,8 @@ S_scan_const(pTHX_ char *start) break; /* in regexp, $ might be tail anchor */ } + /* End of else if chain - OP_TRANS rejoin rest */ + /* backslashes */ if (*s == '\\' && s+1 < send) { s++; @@ -1502,7 +1509,6 @@ S_scan_const(pTHX_ char *start) PL_sublex_info.sub_op->op_private |= (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; } } else { @@ -1603,40 +1609,40 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: - if (!UTF8_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) { - STRLEN len = (STRLEN) -1; - UV uv; - if (this_utf8) { - uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0); - } - if (len == (STRLEN)-1) { - /* Illegal UTF8 (a high-bit byte), make it valid. */ - char *old_pvx = SvPVX(sv); - /* need space for one extra char (NOTE: SvCUR() not set here) */ - d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); - d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++); - } - else { - while (len--) - *d++ = *s++; - } - has_utf8 = TRUE; - if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { - PL_sublex_info.sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; - } - continue; - } - *d++ = NATIVE_TO_NEED(has_utf8,*s++); + /* If we started with encoded form, or already know we want it + and then encode the next character */ + if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) { + STRLEN len = 1; + UV uv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); + STRLEN need = UNISKIP(NATIVE_TO_UNI(uv)); + s += len; + if (need > len) { + /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */ + STRLEN off = d - SvPVX(sv); + d = SvGROW(sv, SvLEN(sv) + (need-len)) + off; + } + d = (char*)uvchr_to_utf8((U8*)d, uv); + has_utf8 = TRUE; + } + else { + *d++ = NATIVE_TO_NEED(has_utf8,*s++); + } } /* while loop to process each character */ /* terminate the string and set up the sv */ *d = '\0'; SvCUR_set(sv, d - SvPVX(sv)); + if (SvCUR(sv) >= SvLEN(sv)) + Perl_croak(aTHX_ "panic:constant overflowed allocated space"); + SvPOK_on(sv); - if (has_utf8) + if (has_utf8) { SvUTF8_on(sv); + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + PL_sublex_info.sub_op->op_private |= + (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + } /* shrink the sv if we allocated more than we used */ if (SvCUR(sv) + 5 < SvLEN(sv)) { @@ -131,28 +131,6 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) #endif /* Loop style */ } -/* -=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv - -Adds the UTF8 representation of the Native codepoint C<uv> to the end -of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free -bytes available. The return value is the pointer to the byte after the -end of the new character. In other words, - - d = uvchr_to_utf8(d, uv); - -is the recommended wide native character-aware way of saying - - *(d++) = uv; - -=cut -*/ - -U8 * -Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) -{ - return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv)); -} /* @@ -461,25 +439,6 @@ malformed: } /* -=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags - -Returns the native character value of the first character in the string C<s> -which is assumed to be in UTF8 encoding; C<retlen> will be set to the -length, in bytes, of that character. - -Allows length and flags to be passed to low level routine. - -=cut -*/ - -UV -Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) -{ - UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); - return UNI_TO_NATIVE(uv); -} - -/* =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen Returns the native character value of the first character in the string C<s> @@ -835,7 +794,7 @@ bool Perl_is_uni_alnum(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_alnum(tmpbuf); } @@ -843,7 +802,7 @@ bool Perl_is_uni_alnumc(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_alnumc(tmpbuf); } @@ -851,7 +810,7 @@ bool Perl_is_uni_idfirst(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_idfirst(tmpbuf); } @@ -859,7 +818,7 @@ bool Perl_is_uni_alpha(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_alpha(tmpbuf); } @@ -867,7 +826,7 @@ bool Perl_is_uni_ascii(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_ascii(tmpbuf); } @@ -875,7 +834,7 @@ bool Perl_is_uni_space(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_space(tmpbuf); } @@ -883,7 +842,7 @@ bool Perl_is_uni_digit(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_digit(tmpbuf); } @@ -891,7 +850,7 @@ bool Perl_is_uni_upper(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_upper(tmpbuf); } @@ -899,7 +858,7 @@ bool Perl_is_uni_lower(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_lower(tmpbuf); } @@ -907,7 +866,7 @@ bool Perl_is_uni_cntrl(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_cntrl(tmpbuf); } @@ -915,7 +874,7 @@ bool Perl_is_uni_graph(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_graph(tmpbuf); } @@ -923,7 +882,7 @@ bool Perl_is_uni_print(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_print(tmpbuf); } @@ -931,7 +890,7 @@ bool Perl_is_uni_punct(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_punct(tmpbuf); } @@ -939,7 +898,7 @@ bool Perl_is_uni_xdigit(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_xdigit(tmpbuf); } @@ -947,7 +906,7 @@ U32 Perl_to_uni_upper(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return to_utf8_upper(tmpbuf); } @@ -955,7 +914,7 @@ U32 Perl_to_uni_title(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return to_utf8_title(tmpbuf); } @@ -963,7 +922,7 @@ U32 Perl_to_uni_lower(pTHX_ U32 c) { U8 tmpbuf[UTF8_MAXLEN+1]; - uvuni_to_utf8(tmpbuf, (UV)c); + uvchr_to_utf8(tmpbuf, (UV)c); return to_utf8_lower(tmpbuf); } @@ -1352,6 +1311,10 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) /* If not cached, generate it via utf8::SWASHGET */ if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) { dSP; + /* We use utf8n_to_uvuni() as we want an index into + Unicode tables, not a native character number. + */ + UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0); ENTER; SAVETMPS; save_re_context(); @@ -1359,10 +1322,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - /* We call utf8_to_uni as we want and index into Unicode tables, - not a native character number. - */ - PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1)))); + PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1)))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR)) @@ -1406,3 +1366,56 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) Perl_croak(aTHX_ "panic: swash_fetch"); return 0; } + + +/* +=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv + +Adds the UTF8 representation of the Native codepoint C<uv> to the end +of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free +bytes available. The return value is the pointer to the byte after the +end of the new character. In other words, + + d = uvchr_to_utf8(d, uv); + +is the recommended wide native character-aware way of saying + + *(d++) = uv; + +=cut +*/ + +/* On ASCII machines this is normally a macro but we want a + real function in case XS code wants it +*/ +#undef Perl_uvchr_to_utf8 +U8 * +Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) +{ + return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv)); +} + + +/* +=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags + +Returns the native character value of the first character in the string C<s> +which is assumed to be in UTF8 encoding; C<retlen> will be set to the +length, in bytes, of that character. + +Allows length and flags to be passed to low level routine. + +=cut +*/ +/* On ASCII machines this is normally a macro but we want a + real function in case XS code wants it +*/ +#undef Perl_utf8n_to_uvchr +UV +Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) +{ + UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); + return UNI_TO_NATIVE(uv); +} + + @@ -48,6 +48,10 @@ END_EXTERN_C #define NATIVE_TO_NEED(enc,ch) (ch) #define ASCII_TO_NEED(enc,ch) (ch) +/* As there are no translations avoid the function wrapper */ +#define Perl_utf8n_to_uvchr Perl_utf8n_to_uvuni +#define Perl_uvchr_to_utf8 Perl_uvuni_to_utf8 + /* The following table is from Unicode 3.1. diff --git a/utfebcdic.h b/utfebcdic.h index ef67cb2c35..0dd73d2bb0 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -274,7 +274,7 @@ END_EXTERN_C #define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c)) #define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) #define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0) -#define UTF8_IS_CONTINUATION(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) == 0xA0) +#define UTF8_IS_CONTINUATION(c) ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0) #define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0) #define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xF8) == 0xC0) |