summary refs log tree commit diff
path: root/utf8.c
diff options
context:
space:
mode:
author Jarkko Hietaniemi <jhi@iki.fi> 2002-02-17 20:44:59 +0000
committer Jarkko Hietaniemi <jhi@iki.fi> 2002-02-17 20:44:59 +0000
commit 2a37f04dcbe9376a1280272d62501209825ac83c (patch)
tree aa95c4a8148871c097378036d91d4a9bc6580095 /utf8.c
parent f51e7ad5e98fec2428ec465e9ce55643d93c2291 (diff)
download perl-2a37f04dcbe9376a1280272d62501209825ac83c.tar.gz
Clearing up to_utf8_case() continues: this time use
a single return, and EBCDICification for all paths.

p4raw-id: //depot/perl@14734
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 114
1 files changed, 58 insertions, 56 deletions
diff --git a/utf8.c b/utf8.c
index 314bbffe87..b33e3db36a 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1288,8 +1288,10 @@ UV
Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
{
UV uv0, uv1, uv2;
- U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+ U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *d;
+ char *s = NULL;
STRLEN len;
+ bool has_utf8 = FALSE;
if (!*swashp)
*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
@@ -1301,76 +1303,76 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma
uvuni_to_utf8(tmpbuf, uv1);
uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
if (uv2) {
- /* It was "normal" (single character mapping). */
- UV uv3 = UNI_TO_NATIVE(uv2);
-
- len = uvchr_to_utf8(ustrp, uv3) - ustrp;
- if (lenp)
- *lenp = len;
-
- return uv3;
+ /* It was "normal" (a single character mapping). */
+ d = uvuni_to_utf8(ustrp, uv2);
+ has_utf8 = !UNI_IS_INVARIANT(uv2);
}
else {
+ /* It might be "special" (sometimes, but not always,
+ * a multicharacter mapping) */
HV *hv;
SV *keysv;
HE *he;
+ SV *val;
if ((hv = get_hv(special, FALSE)) &&
(keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
- (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
- SV *val = HeVAL(he);
- char *s = SvPV(val, len);
+ (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
+ (val = HeVAL(he))) {
- if (len > 1) {
+ s = SvPV(val, len);
+ if (len == 1)
+ d = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s));
+ else {
Copy(s, ustrp, len, U8);
-#ifdef EBCDIC
- {
- /* If we have EBCDIC we need to remap the
- * characters coming in from the "special"
- * (usually, but not always multicharacter)
- * mapping, since any characters in the low 256
- * are in Unicode code points, not EBCDIC.
- * --jhi */
- U8 *d = tmpbuf;
- U8 *t, *tend;
-
- if (SvUTF8(val)) {
- STRLEN tlen = 0;
-
- for (t = ustrp, tend = t + len;
- t < tend; t += tlen) {
- UV c = utf8_to_uvchr(t, &tlen);
-
- if (tlen > 0)
- d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
- else
- break;
- }
- } else {
- for (t = ustrp, tend = t + len;
- t < tend; t++)
- d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
- }
- len = d - tmpbuf;
- Copy(tmpbuf, ustrp, len, U8);
- }
-#endif
+ d = ustrp + len;
}
- else
- len = uvchr_to_utf8(ustrp, UNI_TO_NATIVE(*(U8*)s)) - ustrp;
- if (lenp)
- *lenp = len;
-
- return utf8_to_uvchr(ustrp, 0);
+ if (SvUTF8(val))
+ has_utf8 = TRUE;
}
+ else {
+ /* It was not "special", either. */
+ d = uvuni_to_utf8(ustrp, uv1);
+ has_utf8 = !UNI_IS_INVARIANT(uv1);
+ }
+ }
- /* So it was not "special": just copy it. */
- len = uvchr_to_utf8(ustrp, uv0) - ustrp;
- if (lenp)
- *lenp = len;
+ len = d - ustrp;
- return uv0;
+#ifdef EBCDIC
+ {
+ /* If we have EBCDIC we need to remap the characters since
+ * any characters in the low 256 are in Unicode code points,
+ * not EBCDIC. */
+ U8 *t, *tend;
+
+ d = tmpbuf;
+ if (has_utf8) {
+ STRLEN tlen = 0;
+
+ for (t = ustrp, tend = t + len;
+ t < tend; t += tlen) {
+ UV c = utf8_to_uvchr(t, &tlen);
+
+ if (tlen > 0)
+ d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
+ else
+ break;
+ }
+ } else {
+ for (t = ustrp, tend = t + len;
+ t < tend; t++)
+ d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
+ }
+ len = d - tmpbuf;
+ Copy(tmpbuf, ustrp, len, U8);
}
+#endif
+
+ if (lenp)
+ *lenp = len;
+
+ return utf8_to_uvchr(ustrp, 0);
}
/*