diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-11-12 09:05:19 -0700 |
---|---|---|
committer | Father Chrysostomos <sprout@cpan.org> | 2010-11-22 13:32:51 -0800 |
commit | d02f4dad561ba81f934560f8eab051147a45e09e (patch) | |
tree | 55beb313f848843ad63aca8c05d7162b04686b82 /utfebcdic.h | |
parent | 6d8e7a01cde4803f9769fb51a041fbd8a839e0b2 (diff) | |
download | perl-d02f4dad561ba81f934560f8eab051147a45e09e.tar.gz |
PL_fold wrong for EBCDIC platforms.
On EBCDIC platforms the PL_fold table maps only the ASCII-subrange
characters correctly, not the full native Latin1 range.
To fix this, I moved the table to utfebcdic.h for EBCDIC platforms and
split it into three tables, one for each of the code pages known to
Perl (IBM-1047, POSIX-BC, and IBM-037).
There is no EBCDIC platform available to test on. What I did was hack
together a program from existing code that does EBCDIC transforms. I
ran it in ASCII mode, and verified that the generated table was
identical to the Latin1 table I had previously constructed by hand and
extensively tested. I then ran it on each of the three EBCDIC
transforms, and verified that each matched the places in the original
table that I knew were correct: all the ASCII alphabetics, the controls,
and a few other code points.
So these tables are at least as correct as the existing one, since they
are identical to it for [A-Z] and [a-z].
Diffstat (limited to 'utfebcdic.h')
-rw-r--r-- | utfebcdic.h | 113 |
1 files changed, 113 insertions, 0 deletions
diff --git a/utfebcdic.h b/utfebcdic.h index c3fe6036ee..2fb5b9e4e2 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -317,6 +317,42 @@ EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (IBM-1047) to ASCII (iso-8859-1) * 0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F }; + +EXTCONST unsigned char PL_fold[] = { /* fast EBCDIC case folding table, 'A' => + 'a'; 'a' => 'A' */ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 98, 99, 100, 101, 102, 103, + 104, 105, 74, 75, 76, 77, 78, 79, + 80, 113, 114, 115, 116, 117, 118, 119, + 120, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 66, 67, 68, 69, 70, 71, + 72, 73, 106, 107, 108, 109, 110, 111, + 128, 81, 82, 83, 84, 85, 86, 87, + 88, 121, 122, 123, 124, 125, 126, 127, + 112, 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 138, 139, 172, 186, 174, 143, + 144, 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 154, 155, 158, 157, 156, 159, + 160, 161, 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', 170, 171, 140, 173, 142, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 141, 187, 188, 189, 190, 191, + 192, 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 202, 235, 236, 237, 238, 239, + 208, 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 218, 251, 252, 253, 254, 223, + 224, 225, 's', 't', 'u', 'v', 'w', 'x', + 'y', 'z', 234, 203, 204, 205, 206, 207, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 219, 220, 221, 222, 255 +}; #endif /* 1047 */ #if '^' == 106 /* if defined(_OSD_POSIX) POSIX-BC */ @@ -361,6 +397,42 @@ EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1) */ 0xD9, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 
0xD2, 0xD3, 0xD5, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xB3, 0x7B, 0xDC, 0x7D, 0xDA, 0x7E }; + +EXTCONST unsigned char PL_fold[] = { /* fast EBCDIC case folding table, 'A' => + 'a'; 'a' => 'A' */ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 98, 99, 100, 101, 102, 103, + 104, 105, 74, 75, 76, 77, 78, 79, + 80, 113, 114, 115, 116, 117, 118, 119, + 120, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 66, 67, 68, 69, 70, 71, + 72, 73, 106, 107, 108, 109, 110, 111, + 128, 81, 82, 83, 84, 85, 86, 87, + 88, 121, 122, 123, 124, 125, 126, 127, + 112, 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 138, 139, 172, 173, 174, 143, + 144, 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 154, 155, 158, 157, 156, 159, + 160, 161, 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', 170, 171, 140, 141, 142, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 224, 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 202, 235, 236, 237, 238, 239, + 208, 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 218, 221, 252, 219, 254, 223, + 192, 225, 's', 't', 'u', 'v', 'w', 'x', + 'y', 'z', 234, 203, 204, 205, 206, 207, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 220, 253, 222, 255 +}; #endif /* POSIX-BC */ #if '^' == 176 /* if defined(??) (OS/400?) 
037 */ @@ -406,6 +478,42 @@ EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1) */ 0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F }; + +EXTCONST unsigned char PL_fold[] = { /* fast EBCDIC case folding table, 'A' => + 'a'; 'a' => 'A' */ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 98, 99, 100, 101, 102, 103, + 104, 105, 74, 75, 76, 77, 78, 79, + 80, 113, 114, 115, 116, 117, 118, 119, + 120, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 66, 67, 68, 69, 70, 71, + 72, 73, 106, 107, 108, 109, 110, 111, + 128, 81, 82, 83, 84, 85, 86, 87, + 88, 121, 122, 123, 124, 125, 126, 127, + 112, 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 138, 139, 172, 173, 174, 143, + 144, 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 154, 155, 158, 157, 156, 159, + 160, 161, 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', 170, 171, 140, 141, 142, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 202, 235, 236, 237, 238, 239, + 208, 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 218, 251, 252, 253, 254, 223, + 224, 225, 's', 't', 'u', 'v', 'w', 'x', + 'y', 'z', 234, 203, 204, 205, 206, 207, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 219, 220, 221, 222, 255 +}; #endif /* 037 */ #else @@ -414,8 +522,13 @@ EXTCONST unsigned char PL_e2utf[]; EXTCONST unsigned char PL_utf2e[]; EXTCONST unsigned char PL_e2a[]; EXTCONST unsigned char PL_a2e[]; +EXTCONST unsigned char PL_fold[]; #endif +/* Since the EBCDIC code pages are isomorphic to Latin1, that table is merely a + * duplicate */ +EXTCONST unsigned char * 
PL_fold_latin1 = PL_fold; + END_EXTERN_C /* EBCDIC-happy ways of converting native code to UTF-8 */ |