diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2002-02-13 00:24:37 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2002-02-13 00:24:37 +0000 |
commit | d2dcd0fb0e5b4b6b0e01e4cff08a37dff0d015ce (patch) | |
tree | 8afc773198a2130f2aa373e59ccd725cfeade17b /utf8.c | |
parent | 3275ba964c17f100af90c1175c6541dabcee6dbb (diff) | |
download | perl-d2dcd0fb0e5b4b6b0e01e4cff08a37dff0d015ce.tar.gz |
Rewrite the "special mapping" part of to_utf8_case(),
this time with fewer bugs. (See: The Law of Cybernetic
Entomology.)
p4raw-id: //depot/perl@14664
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 72 |
1 files changed, 46 insertions, 26 deletions
@@ -1303,36 +1303,56 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) && (he = hv_fetch_ent(hv, keysv, FALSE, 0))) { SV *val = HeVAL(he); - char *s = SvPV(val, *lenp); - U8 c = *(U8*)s; + STRLEN len; + char *s = SvPV(val, len); - if (*lenp > 1 || UNI_IS_INVARIANT(c)) - Copy(s, ustrp, *lenp, U8); - else { - /* something in the 0x80..0xFF range */ - ustrp[0] = UTF8_EIGHT_BIT_HI(c); - ustrp[1] = UTF8_EIGHT_BIT_LO(c); - *lenp = 2; - } + if (len > 1) { + Copy(s, ustrp, len, U8); #ifdef EBCDIC - { - U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; - U8 *d = tmpbuf; - U8 *t, *tend; - STRLEN tlen; - - for (t = ustrp, tend = t + *lenp; t < tend; t += tlen) { - UV c = utf8_to_uvchr(t, &tlen); - - if (tlen > 0) - d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); - else - break; + { + /* If we have EBCDIC we need to remap the + * characters coming in from the "special" + * (usually, but not always multicharacter) + * mapping, since any characters in the low 256 + * are in Unicode code points, not EBCDIC. + * If we either had a bit in the "special" + * mappings indicating "contains lower 256", + * or if we on EBCDIC platforms regenerate the + * lib/unicore/To/Foo.pl, we could do without + * this, but for now, let's do it this way. + * --jhi */ + + U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; + U8 *d = tmpbuf; + U8 *t, *tend; + STRLEN tlen; + + for (t = ustrp, tend = t + len; t < tend; t += tlen) { + UV c = utf8_to_uvchr(t, &tlen); + + if (tlen > 0) + d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); + else + break; + } + len = d - tmpbuf; + Copy(tmpbuf, ustrp, len, U8); } - *lenp = d - tmpbuf; - Copy(tmpbuf, ustrp, *lenp, U8); - } #endif + } + else { + U8 c = UNI_TO_NATIVE(*s); + + if (NATIVE_IS_INVARIANT(c)) + ustrp[0] = c; + else { + ustrp[0] = UTF8_EIGHT_BIT_HI(c); + ustrp[1] = UTF8_EIGHT_BIT_LO(c); + len = 2; + } + } + if (lenp) + *lenp = len; return utf8_to_uvchr(ustrp, 0); } uv = NATIVE_TO_UNI(uv); |