summaryrefslogtreecommitdiff
path: root/doop.c
diff options
context:
space:
mode:
authorInaba Hiroto <inaba@st.rim.or.jp>2000-12-30 23:27:10 +0900
committerJarkko Hietaniemi <jhi@iki.fi>2000-12-30 17:14:19 +0000
commit1aa99e6b6d14c469ac825dde483d9c9f913a3ee2 (patch)
tree76ac8dd0d9473e84a19f6184baa4eddb6337e7b1 /doop.c
parent13e8c8e316d3839d0834fb8b851566b00d81e876 (diff)
downloadperl-1aa99e6b6d14c469ac825dde483d9c9f913a3ee2.tar.gz
more UTF8 test suites and an UTF8 patch
Message-ID: <3A4D722D.243AFD88@st.rim.or.jp> Just the patch part for now, and the pragma renamed as unicode::distinct. p4raw-id: //depot/perl@8267
Diffstat (limited to 'doop.c')
-rw-r--r--doop.c333
1 files changed, 186 insertions, 147 deletions
diff --git a/doop.c b/doop.c
index 3548556a89..8256b93d53 100644
--- a/doop.c
+++ b/doop.c
@@ -21,18 +21,6 @@
#endif
#endif
-#define HALF_UTF8_UPGRADE(start,end) \
- STMT_START { \
- if ((start)<(end)) { \
- U8* NeWsTr; \
- STRLEN LeN = (end) - (start); \
- NeWsTr = bytes_to_utf8(start, &LeN); \
- Safefree(start); \
- (start) = NeWsTr; \
- (end) = (start) + LeN; \
- } \
- } STMT_END
-
STATIC I32
S_do_trans_simple(pTHX_ SV *sv)
{
@@ -41,7 +29,6 @@ S_do_trans_simple(pTHX_ SV *sv)
U8 *send;
U8 *dstart;
I32 matches = 0;
- I32 sutf = SvUTF8(sv);
STRLEN len;
short *tbl;
I32 ch;
@@ -54,7 +41,7 @@ S_do_trans_simple(pTHX_ SV *sv)
send = s + len;
/* First, take care of non-UTF8 input strings, because they're easy */
- if (!sutf) {
+ if (!SvUTF8(sv)) {
while (s < send) {
if ((ch = tbl[*s]) >= 0) {
matches++;
@@ -79,7 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv)
c = utf8_to_uv(s, send - s, &ulen, 0);
if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
matches++;
- d = uv_to_utf8(d,ch);
+ if (ch < 0x80)
+ *d++ = ch;
+ else
+ d = uv_to_utf8(d,ch);
s += ulen;
}
else { /* No match -> copy */
@@ -88,8 +78,7 @@ S_do_trans_simple(pTHX_ SV *sv)
}
}
*d = '\0';
- sv_setpvn(sv, (const char*)dstart, d - dstart);
- Safefree(dstart);
+ sv_setpvn(sv, (char*)dstart, d - dstart);
SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
@@ -101,7 +90,6 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
U8 *s;
U8 *send;
I32 matches = 0;
- I32 hasutf = SvUTF8(sv);
STRLEN len;
short *tbl;
@@ -112,22 +100,20 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
s = (U8*)SvPV(sv, len);
send = s + len;
- while (s < send) {
- if (hasutf && *s & 0x80)
- s += UTF8SKIP(s);
- else {
- UV c;
- STRLEN ulen;
- ulen = 1;
- if (hasutf)
- c = utf8_to_uv(s, send - s, &ulen, 0);
- else
- c = *s;
- if (c < 0x100 && tbl[c] >= 0)
+ if (!SvUTF8(sv))
+ while (s < send) {
+ if (tbl[*s++] >= 0)
matches++;
- s += ulen;
- }
- }
+ }
+ else
+ while (s < send) {
+ UV c;
+ STRLEN ulen;
+ c = utf8_to_uv(s, send - s, &ulen, 0);
+ if (c < 0x100 && tbl[c] >= 0)
+ matches++;
+ s += ulen;
+ }
return matches;
}
@@ -139,7 +125,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
U8 *send;
U8 *d;
U8 *dstart;
- I32 hasutf = SvUTF8(sv);
+ I32 isutf8;
I32 matches = 0;
STRLEN len;
short *tbl;
@@ -150,64 +136,96 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
Perl_croak(aTHX_ "panic: do_trans_complex");
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
send = s + len;
- Newz(0, d, len*2+1, U8);
- dstart = d;
-
- if (PL_op->op_private & OPpTRANS_SQUASH) {
- U8* p = send;
-
- while (s < send) {
- if (hasutf && *s & 0x80)
- s += UTF8SKIP(s);
- else {
- if ((ch = tbl[*s]) >= 0) {
+ if (!isutf8) {
+ dstart = d = s;
+ if (PL_op->op_private & OPpTRANS_SQUASH) {
+ U8* p = send;
+ while (s < send) {
+ if ((ch = tbl[*s]) >= 0) {
*d = ch;
matches++;
- if (p != d - 1 || *p != *d)
- p = d++;
- }
- else if (ch == -1) /* -1 is unmapped character */
- *d++ = *s; /* -2 is delete character */
- s++;
- }
+ if (p != d - 1 || *p != *d)
+ p = d++;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s;
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s++;
+ }
}
+ else {
+ while (s < send) {
+ if ((ch = tbl[*s]) >= 0) {
+ matches++;
+ *d++ = ch;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s;
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s++;
+ }
+ }
+ SvCUR_set(sv, d - dstart);
}
- else {
- while (s < send) {
- UV comp;
- if (hasutf && *s & 0x80)
- comp = utf8_to_uv_simple(s, NULL);
- else
- comp = *s;
-
- ch = tbl[comp];
-
- if (ch == -1) { /* -1 is unmapped character */
- ch = comp;
- matches--;
- }
-
- if (ch >= 0)
- d = uv_to_utf8(d, ch);
-
- matches++;
-
- s += hasutf && *s & 0x80 ? UNISKIP(*s) : 1;
-
+ else { /* isutf8 */
+ Newz(0, d, len*2+1, U8);
+ dstart = d;
+
+ if (PL_op->op_private & OPpTRANS_SQUASH) {
+ U8* p = send;
+ UV pch = 0xfeedface;
+ while (s < send) {
+ STRLEN len;
+ UV comp = utf8_to_uv_simple(s, &len);
+
+ if (comp > 0xff)
+ d = uv_to_utf8(d, comp); /* always unmapped */
+ else if ((ch = tbl[comp]) >= 0) {
+ matches++;
+ if (ch != pch) {
+ d = uv_to_utf8(d, ch);
+ pch = ch;
+ }
+ s += len;
+ continue;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ d = uv_to_utf8(d, comp);
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s += len;
+ pch = 0xfeedface;
+ }
}
+ else {
+ while (s < send) {
+ STRLEN len;
+ UV comp = utf8_to_uv_simple(s, &len);
+ if (comp > 0xff)
+ d = uv_to_utf8(d, comp); /* always unmapped */
+ else if ((ch = tbl[comp]) >= 0) {
+ d = uv_to_utf8(d, ch);
+ matches++;
+ }
+ else if (ch == -1) { /* -1 is unmapped character */
+ d = uv_to_utf8(d, comp);
+ }
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s += len;
+ }
+ }
+ *d = '\0';
+ sv_setpvn(sv, (char*)dstart, d - dstart);
+ SvUTF8_on(sv);
}
-
- *d = '\0';
-
- sv_setpvn(sv, (const char*)dstart, d - dstart);
- Safefree(dstart);
- if (hasutf)
- SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
-
}
STATIC I32
@@ -217,7 +235,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
U8 *send;
U8 *d;
U8 *start;
- U8 *dstart;
+ U8 *dstart, *dend;
I32 matches = 0;
STRLEN len;
@@ -228,11 +246,19 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
UV extra = none + 1;
UV final;
UV uv;
- I32 isutf;
- I32 howmany;
+ I32 isutf8;
+ U8 hibit = 0;
- isutf = SvUTF8(sv);
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
+ if (!isutf8) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ s = bytes_to_utf8(s, &len);
+ }
send = s + len;
start = s;
@@ -241,41 +267,46 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
final = SvUV(*svp);
/* d needs to be bigger than s, in case e.g. upgrading is required */
- Newz(0, d, len*2+1, U8);
+ New(0, d, len*3+UTF8_MAXLEN, U8);
+ dend = d + len * 3;
dstart = d;
+
while (s < send) {
if ((uv = swash_fetch(rv, s)) < none) {
s += UTF8SKIP(s);
matches++;
- if ((uv & 0x80) && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
d = uv_to_utf8(d, uv);
}
else if (uv == none) {
- int i;
- i = UTF8SKIP(s);
- if (i > 1 && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
+ int i = UTF8SKIP(s);
while(i--)
*d++ = *s++;
}
else if (uv == extra) {
- int i;
- i = UTF8SKIP(s);
+ int i = UTF8SKIP(s);
s += i;
matches++;
- if (i > 1 && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
d = uv_to_utf8(d, final);
}
else
s += UTF8SKIP(s);
+
+ if (d >= dend) {
+ STRLEN clen = d - dstart;
+ STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
+ }
}
*d = '\0';
- sv_setpvn(sv, (const char*)dstart, d - dstart);
+ sv_setpvn(sv, (char*)dstart, d - dstart);
SvSETMAGIC(sv);
- if (isutf)
- SvUTF8_on(sv);
+ SvUTF8_on(sv);
+ if (hibit)
+ Safefree(start);
+ if (!isutf8 && !(PL_hints & HINT_UTF8))
+ sv_utf8_downgrade(sv, TRUE);
return matches;
}
@@ -284,7 +315,7 @@ STATIC I32
S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
{
U8 *s;
- U8 *send;
+ U8 *start, *send;
I32 matches = 0;
STRLEN len;
@@ -293,10 +324,17 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
UV none = svp ? SvUV(*svp) : 0x7fffffff;
UV uv;
+ U8 hibit = 0;
s = (U8*)SvPV(sv, len);
- if (!SvUTF8(sv))
- s = bytes_to_utf8(s, &len);
+ if (!SvUTF8(sv)) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ start = s = bytes_to_utf8(s, &len);
+ }
send = s + len;
while (s < send) {
@@ -304,6 +342,8 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
matches++;
s += UTF8SKIP(s);
}
+ if (hibit)
+ Safefree(start);
return matches;
}
@@ -312,7 +352,7 @@ STATIC I32
S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
{
U8 *s;
- U8 *send;
+ U8 *start, *send;
U8 *d;
I32 matches = 0;
I32 squash = PL_op->op_private & OPpTRANS_SQUASH;
@@ -325,41 +365,45 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
UV final;
UV uv;
STRLEN len;
- U8 *dst;
- I32 isutf = SvUTF8(sv);
+ U8 *dstart, *dend;
+ I32 isutf8;
+ U8 hibit = 0;
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
+ if (!isutf8) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ s = bytes_to_utf8(s, &len);
+ }
send = s + len;
+ start = s;
svp = hv_fetch(hv, "FINAL", 5, FALSE);
if (svp)
final = SvUV(*svp);
- Newz(0, d, len*2+1, U8);
- dst = d;
+ New(0, d, len*3+UTF8_MAXLEN, U8);
+ dend = d + len * 3;
+ dstart = d;
if (squash) {
UV puv = 0xfeedface;
while (s < send) {
- if (SvUTF8(sv))
- uv = swash_fetch(rv, s);
- else {
- U8 tmpbuf[2];
- uv = *s++;
- if (uv < 0x80)
- tmpbuf[0] = uv;
- else {
- tmpbuf[0] = (( uv >> 6) | 0xc0);
- tmpbuf[1] = (( uv & 0x3f) | 0x80);
- }
- uv = swash_fetch(rv, tmpbuf);
+ uv = swash_fetch(rv, s);
+
+ if (d >= dend) {
+ STRLEN clen = d - dstart, nlen = dend - dstart + len;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
}
-
if (uv < none) {
matches++;
if (uv != puv) {
- if ((uv & 0x80) && !isutf++)
- HALF_UTF8_UPGRADE(dst,d);
d = uv_to_utf8(d, uv);
puv = uv;
}
@@ -367,9 +411,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- STRLEN ulen;
- *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
- s += ulen;
+ int i = UTF8SKIP(s);
+ while(i--)
+ *d++ = *s++;
puv = 0xfeedface;
continue;
}
@@ -388,18 +432,12 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
}
else {
while (s < send) {
- if (SvUTF8(sv))
- uv = swash_fetch(rv, s);
- else {
- U8 tmpbuf[2];
- uv = *s++;
- if (uv < 0x80)
- tmpbuf[0] = uv;
- else {
- tmpbuf[0] = (( uv >> 6) | 0xc0);
- tmpbuf[1] = (( uv & 0x3f) | 0x80);
- }
- uv = swash_fetch(rv, tmpbuf);
+ uv = swash_fetch(rv, s);
+ if (d >= dend) {
+ STRLEN clen = d - dstart, nlen = dend - dstart + len;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
}
if (uv < none) {
matches++;
@@ -408,9 +446,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- STRLEN ulen;
- *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
- s += ulen;
+ int i = UTF8SKIP(s);
+ while(i--)
+ *d++ = *s++;
continue;
}
else if (uv == extra && !del) {
@@ -423,12 +461,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
s += UTF8SKIP(s);
}
}
- if (dst)
- sv_usepvn(sv, (char*)dst, d - dst);
- else {
- *d = '\0';
- SvCUR_set(sv, d - (U8*)SvPVX(sv));
- }
+ *d = '\0';
+ sv_setpvn(sv, (char*)dstart, d - dstart);
+ SvUTF8_on(sv);
+ if (hibit)
+ Safefree(start);
+ if (!isutf8 && !(PL_hints & HINT_UTF8))
+ sv_utf8_downgrade(sv, TRUE);
SvSETMAGIC(sv);
return matches;