diff options
author | Inaba Hiroto <inaba@st.rim.or.jp> | 2000-12-30 23:27:10 +0900 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2000-12-30 17:14:19 +0000 |
commit | 1aa99e6b6d14c469ac825dde483d9c9f913a3ee2 (patch) | |
tree | 76ac8dd0d9473e84a19f6184baa4eddb6337e7b1 /doop.c | |
parent | 13e8c8e316d3839d0834fb8b851566b00d81e876 (diff) | |
download | perl-1aa99e6b6d14c469ac825dde483d9c9f913a3ee2.tar.gz |
more UTF8 test suites and an UTF8 patch
Message-ID: <3A4D722D.243AFD88@st.rim.or.jp>
Just the patch part for now, and the pragma renamed
as unicode::distinct.
p4raw-id: //depot/perl@8267
Diffstat (limited to 'doop.c')
-rw-r--r-- | doop.c | 333 |
1 files changed, 186 insertions, 147 deletions
@@ -21,18 +21,6 @@ #endif #endif -#define HALF_UTF8_UPGRADE(start,end) \ - STMT_START { \ - if ((start)<(end)) { \ - U8* NeWsTr; \ - STRLEN LeN = (end) - (start); \ - NeWsTr = bytes_to_utf8(start, &LeN); \ - Safefree(start); \ - (start) = NeWsTr; \ - (end) = (start) + LeN; \ - } \ - } STMT_END - STATIC I32 S_do_trans_simple(pTHX_ SV *sv) { @@ -41,7 +29,6 @@ S_do_trans_simple(pTHX_ SV *sv) U8 *send; U8 *dstart; I32 matches = 0; - I32 sutf = SvUTF8(sv); STRLEN len; short *tbl; I32 ch; @@ -54,7 +41,7 @@ S_do_trans_simple(pTHX_ SV *sv) send = s + len; /* First, take care of non-UTF8 input strings, because they're easy */ - if (!sutf) { + if (!SvUTF8(sv)) { while (s < send) { if ((ch = tbl[*s]) >= 0) { matches++; @@ -79,7 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv) c = utf8_to_uv(s, send - s, &ulen, 0); if (c < 0x100 && (ch = tbl[(short)c]) >= 0) { matches++; - d = uv_to_utf8(d,ch); + if (ch < 0x80) + *d++ = ch; + else + d = uv_to_utf8(d,ch); s += ulen; } else { /* No match -> copy */ @@ -88,8 +78,7 @@ S_do_trans_simple(pTHX_ SV *sv) } } *d = '\0'; - sv_setpvn(sv, (const char*)dstart, d - dstart); - Safefree(dstart); + sv_setpvn(sv, (char*)dstart, d - dstart); SvUTF8_on(sv); SvSETMAGIC(sv); return matches; @@ -101,7 +90,6 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ U8 *s; U8 *send; I32 matches = 0; - I32 hasutf = SvUTF8(sv); STRLEN len; short *tbl; @@ -112,22 +100,20 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ s = (U8*)SvPV(sv, len); send = s + len; - while (s < send) { - if (hasutf && *s & 0x80) - s += UTF8SKIP(s); - else { - UV c; - STRLEN ulen; - ulen = 1; - if (hasutf) - c = utf8_to_uv(s, send - s, &ulen, 0); - else - c = *s; - if (c < 0x100 && tbl[c] >= 0) + if (!SvUTF8(sv)) + while (s < send) { + if (tbl[*s++] >= 0) matches++; - s += ulen; - } - } + } + else + while (s < send) { + UV c; + STRLEN ulen; + c = utf8_to_uv(s, send - s, &ulen, 0); + if (c < 0x100 && tbl[c] >= 0) + matches++; + s += ulen; + } return matches; } @@ -139,7 +125,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ U8 *send; U8 *d; U8 *dstart; - I32 hasutf = SvUTF8(sv); + I32 isutf8; I32 matches = 0; STRLEN len; short *tbl; @@ -150,64 +136,96 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ Perl_croak(aTHX_ "panic: do_trans_complex"); s = (U8*)SvPV(sv, len); + isutf8 = SvUTF8(sv); send = s + len; - Newz(0, d, len*2+1, U8); - dstart = d; - - if (PL_op->op_private & OPpTRANS_SQUASH) { - U8* p = send; - - while (s < send) { - if (hasutf && *s & 0x80) - s += UTF8SKIP(s); - else { - if ((ch = tbl[*s]) >= 0) { + if (!isutf8) { + dstart = d = s; + if (PL_op->op_private & OPpTRANS_SQUASH) { + U8* p = send; + while (s < send) { + if ((ch = tbl[*s]) >= 0) { *d = ch; matches++; - if (p != d - 1 || *p != *d) - p = d++; - } - else if (ch == -1) /* -1 is unmapped character */ - *d++ = *s; /* -2 is delete character */ - s++; - } + if (p != d - 1 || *p != *d) + p = d++; + } + else if (ch == -1) /* -1 is unmapped character */ + *d++ = *s; + else if (ch == -2) /* -2 is delete character */ + matches++; + s++; + } } + else { + while (s < send) { + if ((ch = tbl[*s]) >= 0) { + matches++; + *d++ = ch; + } + else if (ch == -1) /* -1 is unmapped character */ + *d++ = *s; + else if (ch == -2) /* -2 is delete character */ + matches++; + s++; + } + } + SvCUR_set(sv, d - dstart); } - else { - while (s < send) { - UV comp; - if (hasutf && *s & 0x80) - comp = utf8_to_uv_simple(s, NULL); - else - comp = *s; - - ch = tbl[comp]; - - if (ch == -1) { /* -1 is unmapped character */ - ch = comp; - matches--; - } - - if (ch >= 0) - d = uv_to_utf8(d, ch); - - matches++; - - s += hasutf && *s & 0x80 ? UNISKIP(*s) : 1; - + else { /* isutf8 */ + Newz(0, d, len*2+1, U8); + dstart = d; + + if (PL_op->op_private & OPpTRANS_SQUASH) { + U8* p = send; + UV pch = 0xfeedface; + while (s < send) { + STRLEN len; + UV comp = utf8_to_uv_simple(s, &len); + + if (comp > 0xff) + d = uv_to_utf8(d, comp); /* always unmapped */ + else if ((ch = tbl[comp]) >= 0) { + matches++; + if (ch != pch) { + d = uv_to_utf8(d, ch); + pch = ch; + } + s += len; + continue; + } + else if (ch == -1) /* -1 is unmapped character */ + d = uv_to_utf8(d, comp); + else if (ch == -2) /* -2 is delete character */ + matches++; + s += len; + pch = 0xfeedface; + } } + else { + while (s < send) { + STRLEN len; + UV comp = utf8_to_uv_simple(s, &len); + if (comp > 0xff) + d = uv_to_utf8(d, comp); /* always unmapped */ + else if ((ch = tbl[comp]) >= 0) { + d = uv_to_utf8(d, ch); + matches++; + } + else if (ch == -1) { /* -1 is unmapped character */ + d = uv_to_utf8(d, comp); + } + else if (ch == -2) /* -2 is delete character */ + matches++; + s += len; + } + } + *d = '\0'; + sv_setpvn(sv, (char*)dstart, d - dstart); + SvUTF8_on(sv); } - - *d = '\0'; - - sv_setpvn(sv, (const char*)dstart, d - dstart); - Safefree(dstart); - if (hasutf) - SvUTF8_on(sv); SvSETMAGIC(sv); return matches; - } STATIC I32 @@ -217,7 +235,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ U8 *send; U8 *d; U8 *start; - U8 *dstart; + U8 *dstart, *dend; I32 matches = 0; STRLEN len; @@ -228,11 +246,19 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ UV extra = none + 1; UV final; UV uv; - I32 isutf; - I32 howmany; + I32 isutf8; + U8 hibit = 0; - isutf = SvUTF8(sv); s = (U8*)SvPV(sv, len); + isutf8 = SvUTF8(sv); + if (!isutf8) { + U8 *t = s, *e = s + len; + while (t < e) + if ((hibit = *t++ & 0x80)) + break; + if (hibit) + s = bytes_to_utf8(s, &len); + } send = s + len; start = s; @@ -241,41 +267,46 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ final = SvUV(*svp); /* d needs to be bigger than s, in case e.g. upgrading is required */ - Newz(0, d, len*2+1, U8); + New(0, d, len*3+UTF8_MAXLEN, U8); + dend = d + len * 3; dstart = d; + while (s < send) { if ((uv = swash_fetch(rv, s)) < none) { s += UTF8SKIP(s); matches++; - if ((uv & 0x80) && !isutf++) - HALF_UTF8_UPGRADE(dstart,d); d = uv_to_utf8(d, uv); } else if (uv == none) { - int i; - i = UTF8SKIP(s); - if (i > 1 && !isutf++) - HALF_UTF8_UPGRADE(dstart,d); + int i = UTF8SKIP(s); while(i--) *d++ = *s++; } else if (uv == extra) { - int i; - i = UTF8SKIP(s); + int i = UTF8SKIP(s); s += i; matches++; - if (i > 1 && !isutf++) - HALF_UTF8_UPGRADE(dstart,d); d = uv_to_utf8(d, final); } else s += UTF8SKIP(s); + + if (d >= dend) { + STRLEN clen = d - dstart; + STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + Renew(dstart, nlen+UTF8_MAXLEN, U8); + d = dstart + clen; + dend = dstart + nlen; + } } *d = '\0'; - sv_setpvn(sv, (const char*)dstart, d - dstart); + sv_setpvn(sv, (char*)dstart, d - dstart); SvSETMAGIC(sv); - if (isutf) - SvUTF8_on(sv); + SvUTF8_on(sv); + if (hibit) + Safefree(start); + if (!isutf8 && !(PL_hints & HINT_UTF8)) + sv_utf8_downgrade(sv, TRUE); return matches; } @@ -284,7 +315,7 @@ STATIC I32 S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ { U8 *s; - U8 *send; + U8 *start, *send; I32 matches = 0; STRLEN len; @@ -293,10 +324,17 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ SV** svp = hv_fetch(hv, "NONE", 4, FALSE); UV none = svp ? SvUV(*svp) : 0x7fffffff; UV uv; + U8 hibit = 0; s = (U8*)SvPV(sv, len); - if (!SvUTF8(sv)) - s = bytes_to_utf8(s, &len); + if (!SvUTF8(sv)) { + U8 *t = s, *e = s + len; + while (t < e) + if ((hibit = *t++ & 0x80)) + break; + if (hibit) + start = s = bytes_to_utf8(s, &len); + } send = s + len; while (s < send) { @@ -304,6 +342,8 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ matches++; s += UTF8SKIP(s); } + if (hibit) + Safefree(start); return matches; } @@ -312,7 +352,7 @@ STATIC I32 S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ { U8 *s; - U8 *send; + U8 *start, *send; U8 *d; I32 matches = 0; I32 squash = PL_op->op_private & OPpTRANS_SQUASH; @@ -325,41 +365,45 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ UV final; UV uv; STRLEN len; - U8 *dst; - I32 isutf = SvUTF8(sv); + U8 *dstart, *dend; + I32 isutf8; + U8 hibit = 0; s = (U8*)SvPV(sv, len); + isutf8 = SvUTF8(sv); + if (!isutf8) { + U8 *t = s, *e = s + len; + while (t < e) + if ((hibit = *t++ & 0x80)) + break; + if (hibit) + s = bytes_to_utf8(s, &len); + } send = s + len; + start = s; svp = hv_fetch(hv, "FINAL", 5, FALSE); if (svp) final = SvUV(*svp); - Newz(0, d, len*2+1, U8); - dst = d; + New(0, d, len*3+UTF8_MAXLEN, U8); + dend = d + len * 3; + dstart = d; if (squash) { UV puv = 0xfeedface; while (s < send) { - if (SvUTF8(sv)) - uv = swash_fetch(rv, s); - else { - U8 tmpbuf[2]; - uv = *s++; - if (uv < 0x80) - tmpbuf[0] = uv; - else { - tmpbuf[0] = (( uv >> 6) | 0xc0); - tmpbuf[1] = (( uv & 0x3f) | 0x80); - } - uv = swash_fetch(rv, tmpbuf); + uv = swash_fetch(rv, s); + + if (d >= dend) { + STRLEN clen = d - dstart, nlen = dend - dstart + len; + Renew(dstart, nlen+UTF8_MAXLEN, U8); + d = dstart + clen; + dend = dstart + nlen; } - if (uv < none) { matches++; if (uv != puv) { - if ((uv & 0x80) && !isutf++) - HALF_UTF8_UPGRADE(dst,d); d = uv_to_utf8(d, uv); puv = uv; } @@ -367,9 +411,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ continue; } else if (uv == none) { /* "none" is unmapped character */ - STRLEN ulen; - *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0); - s += ulen; + int i = UTF8SKIP(s); + while(i--) + *d++ = *s++; puv = 0xfeedface; continue; } @@ -388,18 +432,12 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else { while (s < send) { - if (SvUTF8(sv)) - uv = swash_fetch(rv, s); - else { - U8 tmpbuf[2]; - uv = *s++; - if (uv < 0x80) - tmpbuf[0] = uv; - else { - tmpbuf[0] = (( uv >> 6) | 0xc0); - tmpbuf[1] = (( uv & 0x3f) | 0x80); - } - uv = swash_fetch(rv, tmpbuf); + uv = swash_fetch(rv, s); + if (d >= dend) { + STRLEN clen = d - dstart, nlen = dend - dstart + len; + Renew(dstart, nlen+UTF8_MAXLEN, U8); + d = dstart + clen; + dend = dstart + nlen; } if (uv < none) { matches++; @@ -408,9 +446,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ continue; } else if (uv == none) { /* "none" is unmapped character */ - STRLEN ulen; - *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0); - s += ulen; + int i = UTF8SKIP(s); + while(i--) + *d++ = *s++; continue; } else if (uv == extra && !del) { @@ -423,12 +461,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ s += UTF8SKIP(s); } } - if (dst) - sv_usepvn(sv, (char*)dst, d - dst); - else { - *d = '\0'; - SvCUR_set(sv, d - (U8*)SvPVX(sv)); - } + *d = '\0'; + sv_setpvn(sv, (char*)dstart, d - dstart); + SvUTF8_on(sv); + if (hibit) + Safefree(start); + if (!isutf8 && !(PL_hints & HINT_UTF8)) + sv_utf8_downgrade(sv, TRUE); SvSETMAGIC(sv); return matches; |