summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Peter Prymmer <PPrymmer@factset.com> 2001-03-08 08:23:25 -0800
committer Jarkko Hietaniemi <jhi@iki.fi> 2001-03-09 01:01:27 +0000
commit 3bd709b1a63d554f3d98d5394be78ed628eb46da (patch)
tree b8add48b769c1a6db079b5276eb469d4c75a1ef4
parent 538c41fbfff9c31ee1c8c40096f132b1ea496531 (diff)
download perl-3bd709b1a63d554f3d98d5394be78ed628eb46da.tar.gz
Re: Unicode/EBCDIC
Message-ID: <Pine.OSF.4.10.10103081617390.377472-100000@aspara.forte.com> p4raw-id: //depot/perl@9082
-rw-r--r-- perl.c 6
-rw-r--r-- perl.h 216
-rw-r--r-- sv.c 4
-rw-r--r-- toke.c 10
-rw-r--r-- utf8.c 4
-rw-r--r-- utf8.h 19
6 files changed, 251 insertions, 8 deletions
diff --git a/perl.c b/perl.c
index 03b2ec1118..0920a41d33 100644
--- a/perl.c
+++ b/perl.c
@@ -253,9 +253,9 @@ perl_construct(pTHXx)
if (PERL_REVISION > 127 || PERL_VERSION > 127 || PERL_SUBVERSION > 127)
SvGROW(PL_patchlevel, UTF8_MAXLEN*3+1);
s = (U8*)SvPVX(PL_patchlevel);
- s = uv_to_utf8(s, (UV)PERL_REVISION);
- s = uv_to_utf8(s, (UV)PERL_VERSION);
- s = uv_to_utf8(s, (UV)PERL_SUBVERSION);
+ s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_REVISION)));
+ s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_VERSION)));
+ s = uv_to_utf8(s, (UV)(ASCII_TO_NATIVE(PERL_SUBVERSION)));
*s = '\0';
SvCUR_set(PL_patchlevel, s - (U8*)SvPVX(PL_patchlevel));
SvPOK_on(PL_patchlevel);
diff --git a/perl.h b/perl.h
index 2b66473837..4ee33cc4fa 100644
--- a/perl.h
+++ b/perl.h
@@ -2436,10 +2436,220 @@ EXT char *PL_sig_name[];
EXT int PL_sig_num[];
#endif
-/* fast case folding tables */
+/* fast conversion and case folding tables */
#ifdef DOINIT
#ifdef EBCDIC
+#if '^' == 106 /* if defined(_OSD_POSIX) POSIX-BC */
+EXTCONST unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (POSIX-BC): indexed by the ISO8859-1 code, yields the native code, as ASCII_TO_NATIVE() expects; EXTCONST to match the extern declaration */
+ 0, 1, 2, 3, 55, 45, 46, 47,
+ 22, 5, 21, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 60, 61, 50, 38,
+ 24, 25, 63, 39, 28, 29, 30, 31,
+ 64, 90, 127, 123, 91, 108, 80, 125,
+ 77, 93, 92, 78, 107, 96, 75, 97,
+ 240, 241, 242, 243, 244, 245, 246, 247,
+ 248, 249, 122, 94, 76, 126, 110, 111,
+ 124, 193, 194, 195, 196, 197, 198, 199,
+ 200, 201, 209, 210, 211, 212, 213, 214,
+ 215, 216, 217, 226, 227, 228, 229, 230,
+ 231, 232, 233, 187, 188, 189, 106, 109,
+ 74, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 145, 146, 147, 148, 149, 150,
+ 151, 152, 153, 162, 163, 164, 165, 166,
+ 167, 168, 169, 251, 79, 253, 255, 7,
+ 32, 33, 34, 35, 36, 37, 6, 23,
+ 40, 41, 42, 43, 44, 9, 10, 27,
+ 48, 49, 26, 51, 52, 53, 54, 8,
+ 56, 57, 58, 59, 4, 20, 62, 95,
+ 65, 170, 176, 177, 159, 178, 208, 181,
+ 121, 180, 154, 138, 186, 202, 175, 161,
+ 144, 143, 234, 250, 190, 160, 182, 179,
+ 157, 218, 155, 139, 183, 184, 185, 171,
+ 100, 101, 98, 102, 99, 103, 158, 104,
+ 116, 113, 114, 115, 120, 117, 118, 119,
+ 172, 105, 237, 238, 235, 239, 236, 191,
+ 128, 224, 254, 221, 252, 173, 174, 89,
+ 68, 69, 66, 70, 67, 71, 156, 72,
+ 84, 81, 82, 83, 88, 85, 86, 87,
+ 140, 73, 205, 206, 203, 207, 204, 225,
+ 112, 192, 222, 219, 220, 141, 142, 223
+};
+EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1): indexed by the native code, yields the ISO8859-1 code, as NATIVE_TO_ASCII() expects; EXTCONST to match the extern declaration */
+ 0, 1, 2, 3, 156, 9, 134, 127,
+ 151, 141, 142, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 157, 10, 8, 135,
+ 24, 25, 146, 143, 28, 29, 30, 31,
+ 128, 129, 130, 131, 132, 133, 23, 27,
+ 136, 137, 138, 139, 140, 5, 6, 7,
+ 144, 145, 22, 147, 148, 149, 150, 4,
+ 152, 153, 154, 155, 20, 21, 158, 26,
+ 32, 160, 226, 228, 224, 225, 227, 229,
+ 231, 241, 96, 46, 60, 40, 43, 124,
+ 38, 233, 234, 235, 232, 237, 238, 239,
+ 236, 223, 33, 36, 42, 41, 59, 159,
+ 45, 47, 194, 196, 192, 193, 195, 197,
+ 199, 209, 94, 44, 37, 95, 62, 63,
+ 248, 201, 202, 203, 200, 205, 206, 207,
+ 204, 168, 58, 35, 64, 39, 61, 34,
+ 216, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 171, 187, 240, 253, 254, 177,
+ 176, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 170, 186, 230, 184, 198, 164,
+ 181, 175, 115, 116, 117, 118, 119, 120,
+ 121, 122, 161, 191, 208, 221, 222, 174,
+ 162, 163, 165, 183, 169, 167, 182, 188,
+ 189, 190, 172, 91, 92, 93, 180, 215,
+ 249, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 173, 244, 246, 242, 243, 245,
+ 166, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 185, 251, 252, 219, 250, 255,
+ 217, 247, 83, 84, 85, 86, 87, 88,
+ 89, 90, 178, 212, 214, 210, 211, 213,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 179, 123, 220, 125, 218, 126
+};
+#endif /* POSIX-BC */
+#if '^' == 176 /* if defined(??) (OS/400?) 037 */
+EXTCONST unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-037): indexed by the ISO8859-1 code, yields the native code, as ASCII_TO_NATIVE() expects; EXTCONST to match the extern declaration */
+ 0, 1, 2, 3, 55, 45, 46, 47,
+ 22, 5, 37, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 60, 61, 50, 38,
+ 24, 25, 63, 39, 28, 29, 30, 31,
+ 64, 90, 127, 123, 91, 108, 80, 125,
+ 77, 93, 92, 78, 107, 96, 75, 97,
+ 240, 241, 242, 243, 244, 245, 246, 247,
+ 248, 249, 122, 94, 76, 126, 110, 111,
+ 124, 193, 194, 195, 196, 197, 198, 199,
+ 200, 201, 209, 210, 211, 212, 213, 214,
+ 215, 216, 217, 226, 227, 228, 229, 230,
+ 231, 232, 233, 186, 224, 187, 176, 109,
+ 121, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 145, 146, 147, 148, 149, 150,
+ 151, 152, 153, 162, 163, 164, 165, 166,
+ 167, 168, 169, 192, 79, 208, 161, 7,
+ 32, 33, 34, 35, 36, 21, 6, 23,
+ 40, 41, 42, 43, 44, 9, 10, 27,
+ 48, 49, 26, 51, 52, 53, 54, 8,
+ 56, 57, 58, 59, 4, 20, 62, 255,
+ 65, 170, 74, 177, 159, 178, 106, 181,
+ 189, 180, 154, 138, 95, 202, 175, 188,
+ 144, 143, 234, 250, 190, 160, 182, 179,
+ 157, 218, 155, 139, 183, 184, 185, 171,
+ 100, 101, 98, 102, 99, 103, 158, 104,
+ 116, 113, 114, 115, 120, 117, 118, 119,
+ 172, 105, 237, 238, 235, 239, 236, 191,
+ 128, 253, 254, 251, 252, 173, 174, 89,
+ 68, 69, 66, 70, 67, 71, 156, 72,
+ 84, 81, 82, 83, 88, 85, 86, 87,
+ 140, 73, 205, 206, 203, 207, 204, 225,
+ 112, 221, 222, 219, 220, 141, 142, 223
+};
+EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1): indexed by the native code, yields the ISO8859-1 code, as NATIVE_TO_ASCII() expects; EXTCONST to match the extern declaration */
+ 0, 1, 2, 3, 156, 9, 134, 127,
+ 151, 141, 142, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 157, 133, 8, 135,
+ 24, 25, 146, 143, 28, 29, 30, 31,
+ 128, 129, 130, 131, 132, 10, 23, 27,
+ 136, 137, 138, 139, 140, 5, 6, 7,
+ 144, 145, 22, 147, 148, 149, 150, 4,
+ 152, 153, 154, 155, 20, 21, 158, 26,
+ 32, 160, 226, 228, 224, 225, 227, 229,
+ 231, 241, 162, 46, 60, 40, 43, 124,
+ 38, 233, 234, 235, 232, 237, 238, 239,
+ 236, 223, 33, 36, 42, 41, 59, 172,
+ 45, 47, 194, 196, 192, 193, 195, 197,
+ 199, 209, 166, 44, 37, 95, 62, 63,
+ 248, 201, 202, 203, 200, 205, 206, 207,
+ 204, 96, 58, 35, 64, 39, 61, 34,
+ 216, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 171, 187, 240, 253, 254, 177,
+ 176, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 170, 186, 230, 184, 198, 164,
+ 181, 126, 115, 116, 117, 118, 119, 120,
+ 121, 122, 161, 191, 208, 221, 222, 174,
+ 94, 163, 165, 183, 169, 167, 182, 188,
+ 189, 190, 91, 93, 175, 168, 180, 215,
+ 123, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 173, 244, 246, 242, 243, 245,
+ 125, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 185, 251, 252, 249, 250, 255,
+ 92, 247, 83, 84, 85, 86, 87, 88,
+ 89, 90, 178, 212, 214, 210, 211, 213,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 179, 219, 220, 217, 218, 159
+};
+#endif /* 037 */
+#if '^' == 95 /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */
+EXTCONST unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-1047): indexed by the ISO8859-1 code, yields the native code, as ASCII_TO_NATIVE() expects; EXTCONST to match the extern declaration */
+ 0, 1, 2, 3, 55, 45, 46, 47,
+ 22, 5, 21, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 60, 61, 50, 38,
+ 24, 25, 63, 39, 28, 29, 30, 31,
+ 64, 90, 127, 123, 91, 108, 80, 125,
+ 77, 93, 92, 78, 107, 96, 75, 97,
+ 240, 241, 242, 243, 244, 245, 246, 247,
+ 248, 249, 122, 94, 76, 126, 110, 111,
+ 124, 193, 194, 195, 196, 197, 198, 199,
+ 200, 201, 209, 210, 211, 212, 213, 214,
+ 215, 216, 217, 226, 227, 228, 229, 230,
+ 231, 232, 233, 173, 224, 189, 95, 109,
+ 121, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 145, 146, 147, 148, 149, 150,
+ 151, 152, 153, 162, 163, 164, 165, 166,
+ 167, 168, 169, 192, 79, 208, 161, 7,
+ 32, 33, 34, 35, 36, 37, 6, 23,
+ 40, 41, 42, 43, 44, 9, 10, 27,
+ 48, 49, 26, 51, 52, 53, 54, 8,
+ 56, 57, 58, 59, 4, 20, 62, 255,
+ 65, 170, 74, 177, 159, 178, 106, 181,
+ 187, 180, 154, 138, 176, 202, 175, 188,
+ 144, 143, 234, 250, 190, 160, 182, 179,
+ 157, 218, 155, 139, 183, 184, 185, 171,
+ 100, 101, 98, 102, 99, 103, 158, 104,
+ 116, 113, 114, 115, 120, 117, 118, 119,
+ 172, 105, 237, 238, 235, 239, 236, 191,
+ 128, 253, 254, 251, 252, 186, 174, 89,
+ 68, 69, 66, 70, 67, 71, 156, 72,
+ 84, 81, 82, 83, 88, 85, 86, 87,
+ 140, 73, 205, 206, 203, 207, 204, 225,
+ 112, 221, 222, 219, 220, 141, 142, 223
+};
+EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (IBM-1047) to ASCII (ISO8859-1): indexed by the native code, yields the ISO8859-1 code, as NATIVE_TO_ASCII() expects; EXTCONST to match the extern declaration */
+ 0, 1, 2, 3, 156, 9, 134, 127,
+ 151, 141, 142, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 157, 10, 8, 135,
+ 24, 25, 146, 143, 28, 29, 30, 31,
+ 128, 129, 130, 131, 132, 133, 23, 27,
+ 136, 137, 138, 139, 140, 5, 6, 7,
+ 144, 145, 22, 147, 148, 149, 150, 4,
+ 152, 153, 154, 155, 20, 21, 158, 26,
+ 32, 160, 226, 228, 224, 225, 227, 229,
+ 231, 241, 162, 46, 60, 40, 43, 124,
+ 38, 233, 234, 235, 232, 237, 238, 239,
+ 236, 223, 33, 36, 42, 41, 59, 94,
+ 45, 47, 194, 196, 192, 193, 195, 197,
+ 199, 209, 166, 44, 37, 95, 62, 63,
+ 248, 201, 202, 203, 200, 205, 206, 207,
+ 204, 96, 58, 35, 64, 39, 61, 34,
+ 216, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 171, 187, 240, 253, 254, 177,
+ 176, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 170, 186, 230, 184, 198, 164,
+ 181, 126, 115, 116, 117, 118, 119, 120,
+ 121, 122, 161, 191, 208, 91, 222, 174,
+ 172, 163, 165, 183, 169, 167, 182, 188,
+ 189, 190, 221, 168, 175, 93, 180, 215,
+ 123, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 173, 244, 246, 242, 243, 245,
+ 125, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 185, 251, 252, 249, 250, 255,
+ 92, 247, 83, 84, 85, 86, 87, 88,
+ 89, 90, 178, 212, 214, 210, 211, 213,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 179, 219, 220, 217, 218, 159
+};
+#endif /* 1047 */
EXT unsigned char PL_fold[] = { /* fast EBCDIC case folding table */
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
@@ -2512,6 +2722,10 @@ EXTCONST unsigned char PL_fold[] = {
#endif /* !EBCDIC */
#else
EXTCONST unsigned char PL_fold[];
+#ifdef EBCDIC
+EXTCONST unsigned char PL_e2a[]; /* conversion table indexed by NATIVE_TO_ASCII(); initialised in the DOINIT branch of this header */
+EXTCONST unsigned char PL_a2e[]; /* conversion table indexed by ASCII_TO_NATIVE(); initialised in the DOINIT branch of this header */
+#endif /* EBCDIC */
#endif
#ifdef DOINIT
diff --git a/sv.c b/sv.c
index d47a2f5924..20b4f2a393 100644
--- a/sv.c
+++ b/sv.c
@@ -7188,6 +7188,8 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
iv = *vecstr;
ulen = 1;
}
+ if (iv <256)
+ iv = NATIVE_TO_ASCII(iv); /* v-strings are codepoints */
vecstr += ulen;
veclen -= ulen;
}
@@ -7268,6 +7270,8 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
uv = *vecstr;
ulen = 1;
}
+ if (uv <256)
+ uv = NATIVE_TO_ASCII(uv); /* v-strings are codepoints */
vecstr += ulen;
veclen -= ulen;
}
diff --git a/toke.c b/toke.c
index 4e9020187d..2bb9282822 100644
--- a/toke.c
+++ b/toke.c
@@ -7273,6 +7273,10 @@ vstring:
"Integer overflow in decimal number");
}
}
+ /* THIS IS EVIL */
+ if (rev < 256)
+ rev = ASCII_TO_NATIVE(rev);
+
tmpend = uv_to_utf8(tmpbuf, rev);
if (rev > revmax)
revmax = rev;
@@ -7289,11 +7293,11 @@ vstring:
SvPOK_on(sv);
SvREADONLY_on(sv);
- if (revmax > 127) {
- SvUTF8_on(sv);
+ /* if (revmax > 127) { */
+ SvUTF8_on(sv); /*
if (revmax < 256)
sv_utf8_downgrade(sv, TRUE);
- }
+ } */
}
}
break;
diff --git a/utf8.c b/utf8.c
index f00659a986..55a8c7cba5 100644
--- a/utf8.c
+++ b/utf8.c
@@ -46,6 +46,8 @@ is the recommended Unicode-aware way of saying
U8 *
Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
{
+ if (uv < 0x100)
+ uv = NATIVE_TO_ASCII(uv);
if (uv < 0x80) {
*d++ = uv;
return d;
@@ -254,7 +256,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
if (UTF8_IS_ASCII(uv)) {
if (retlen)
*retlen = 1;
- return *s;
+ return ASCII_TO_NATIVE(*s);
}
if (UTF8_IS_CONTINUATION(uv) &&
diff --git a/utf8.h b/utf8.h
index 8b0c8c3bd4..49178115f0 100644
--- a/utf8.h
+++ b/utf8.h
@@ -131,3 +131,22 @@ END_EXTERN_C
#endif
#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)
+
+/* EBCDIC-happy ways of converting native code to UTF8; the reverse
+ process is taken care of in utf8_to_uv. On EBCDIC builds each macro is a lookup in a 256-entry table (PL_e2a / PL_a2e), so the argument is narrowed to U8 to keep the index inside the table even for a negative or oversized value; on other builds both macros are the identity. */
+
+#ifdef EBCDIC
+#define NATIVE_TO_ASCII(ch) PL_e2a[(U8)(ch)]
+#define ASCII_TO_NATIVE(ch) PL_a2e[(U8)(ch)]
+#else
+#define NATIVE_TO_ASCII(ch) (ch)
+#define ASCII_TO_NATIVE(ch) (ch)
+#endif
+
+#define UTF8_NEEDS_UPGRADE(ch) (NATIVE_TO_ASCII(ch) & 0x80) /* non-zero when the converted code has its high bit set, i.e. it is not the single-byte (< 0x80) case */
+#define NATIVE_TO_UTF8(ch, string) STMT_START { /* store native ch at string and advance string past it; both arguments are evaluated more than once, and string must be a modifiable lvalue */ \
+ if (!UTF8_NEEDS_UPGRADE(ch)) \
+ *(string)++ = NATIVE_TO_ASCII(ch); \
+ else /* uv_to_utf8 is EBCDIC-aware */ \
+ (string) = uv_to_utf8((string), (ch)); \
+ } STMT_END