diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-02-17 13:47:13 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2013-08-29 09:55:52 -0600 |
commit | f4cd282cd75f0333d0ce591c83812eed609d5283 (patch) | |
tree | 5b2e6e4c08b6fe67d75a9d93dea211e71988c3ca | |
parent | 94bb8c36d9e11dd4825e43d06f0832f01a7e5045 (diff) | |
download | perl-f4cd282cd75f0333d0ce591c83812eed609d5283.tar.gz |
Remove EBCDIC remappings
Now that the Unicode tables are stored in native format, we shouldn't be
doing remapping.
Note that this assumes that the Latin1 casing tables are stored in
native order; not all of this has been done yet.
-rw-r--r-- | handy.h | 12 | ||||
-rw-r--r-- | perly.c | 5 | ||||
-rw-r--r-- | pp.c | 2 | ||||
-rw-r--r-- | regcomp.c | 20 | ||||
-rw-r--r-- | regexec.c | 6 | ||||
-rw-r--r-- | utf8.c | 99 |
6 files changed, 52 insertions, 92 deletions
@@ -990,7 +990,7 @@ EXTCONST U32 PL_charclass[]; /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ # define _CC_mask(classnum) (1U << (classnum)) # define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ - && (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(classnum))) + && (PL_charclass[(U8) (c)] & _CC_mask(classnum))) /* The mask for the _A versions of the macros; it just adds in the bit for * ASCII. */ @@ -999,7 +999,7 @@ EXTCONST U32 PL_charclass[]; /* The _A version makes sure that both the desired bit and the ASCII bit * are present */ # define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \ - && ((PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask_A(classnum)) \ + && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \ == _CC_mask_A(classnum))) # define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA) @@ -1020,7 +1020,7 @@ EXTCONST U32 PL_charclass[]; /* Either participates in a fold with a character above 255, or is a * multi-char fold */ -# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(_CC_NONLATIN1_FOLD))) +# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD))) # define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA) # define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \ @@ -1177,8 +1177,7 @@ EXTCONST U32 PL_charclass[]; * out-of-range */ #define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \ ? (c) \ - : LATIN1_TO_NATIVE(PL_latin1_lc[ \ - NATIVE_TO_LATIN1( (U8) (c)) ])) + : PL_latin1_lc[ (U8) (c) ]) #define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */ /* Modified uc. Is correct uc except for three non-ascii chars which are @@ -1186,8 +1185,7 @@ EXTCONST U32 PL_charclass[]; * character for input out-of-range */ #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \ ? (c) \ - : LATIN1_TO_NATIVE(PL_mod_latin1_uc[ \ - NATIVE_TO_LATIN1( (U8) (c)) ])) + : PL_mod_latin1_uc[ (U8) (c) ]) #ifdef USE_NEXT_CTYPE # define isALPHANUMERIC_LC(c) NXIsAlNum((unsigned int)(c)) @@ -342,9 +342,12 @@ Perl_yyparse (pTHX_ int gramtype) parser->yychar = yylex(); #endif +/* perly.tab is shipped based on an ASCII system; if it were to be regenerated + * on a platform that doesn't use ASCII, this translation back would need to be + * removed */ # ifdef EBCDIC if (parser->yychar >= 0 && parser->yychar < 255) { - parser->yychar = NATIVE_TO_ASCII(parser->yychar); + parser->yychar = NATIVE_TO_LATIN1(parser->yychar); } # endif } @@ -4244,7 +4244,7 @@ PP(pp_fc) for (; s < send; s++) { STRLEN ulen; UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags); - if UNI_IS_INVARIANT(fc) { + if NATIVE_IS_INVARIANT(fc) { if (full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) { @@ -1737,7 +1737,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! NATIVE_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } @@ -4276,8 +4276,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value)); + if (! _generic_isCC(value, classnum)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); } } } @@ -4292,8 +4292,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", * in case it isn't a true locale-node. This will * create false positives if it truly is locale */ for (value = 0; value < loop_max; value++) { - if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value)); + if (_generic_isCC(value, classnum)) { + ANYOF_BITMAP_SET(data->start_class, value); } } } @@ -4310,8 +4310,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); for (value = 0; value < loop_max; value++) { - if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value)); + if (_generic_isCC(value, classnum)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); } } } @@ -4326,8 +4326,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", * case it isn't a true locale-node. This will create * false positives if it truly is locale */ for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value)); + if (! _generic_isCC(value, classnum)) { + ANYOF_BITMAP_SET(data->start_class, value); } } if (PL_regkind[OP(scan)] == NPOSIXD) { @@ -10183,7 +10183,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 if (! len_passed_in) { if (UTF) { if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + _to_uni_fold_flags(code_point, character, &len, FOLD_FLAGS_FULL | ((LOC) @@ -1629,13 +1629,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case BOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -4305,7 +4305,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -1407,11 +1407,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ p += 2; if (uv < 0x80) { -#ifdef EBCDIC - *d++ = LATIN1_TO_NATIVE(uv); -#else *d++ = (U8)uv; -#endif continue; } if (uv < 0x800) { @@ -1645,8 +1641,8 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ assert(S_or_s == 'S' || S_or_s == 's'); - if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for - characters in this range */ + if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for + characters in this range */ *p = (U8) converted; *lenp = 1; return converted; @@ -1746,7 +1742,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp) U8 converted = toLOWER_LATIN1(c); if (p != NULL) { - if (UNI_IS_INVARIANT(converted)) { + if (NATIVE_IS_INVARIANT(converted)) { *p = converted; *lenp = 1; } @@ -1816,7 +1812,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f converted = toLOWER_LATIN1(c); } - if (UNI_IS_INVARIANT(converted)) { + if (NATIVE_IS_INVARIANT(converted)) { *p = (U8) converted; *lenp = 1; } @@ -1869,7 +1865,7 @@ bool Perl_is_uni_alnum_lc(pTHX_ UV c) { if (c < 256) { - return isALNUM_LC(UNI_TO_NATIVE(c)); + return isALNUM_LC(c); } return _is_uni_FOO(_CC_WORDCHAR, c); } @@ -1878,7 +1874,7 @@ bool Perl_is_uni_alnumc_lc(pTHX_ UV c) { if (c < 256) { - return isALPHANUMERIC_LC(UNI_TO_NATIVE(c)); + return isALPHANUMERIC_LC(c); } return _is_uni_FOO(_CC_ALPHANUMERIC, c); } @@ -1887,7 +1883,7 @@ bool Perl_is_uni_idfirst_lc(pTHX_ UV c) { if (c < 256) { - return isIDFIRST_LC(UNI_TO_NATIVE(c)); + return isIDFIRST_LC(c); } return _is_uni_perl_idstart(c); } @@ -1896,7 +1892,7 @@ bool Perl_is_uni_alpha_lc(pTHX_ UV c) { if (c < 256) { - return isALPHA_LC(UNI_TO_NATIVE(c)); + return isALPHA_LC(c); } return _is_uni_FOO(_CC_ALPHA, c); } @@ -1905,7 +1901,7 @@ bool Perl_is_uni_ascii_lc(pTHX_ UV c) { if (c < 256) { - return isASCII_LC(UNI_TO_NATIVE(c)); + return isASCII_LC(c); } return 0; } @@ -1914,7 +1910,7 @@ bool Perl_is_uni_blank_lc(pTHX_ UV c) { if (c < 256) { - return isBLANK_LC(UNI_TO_NATIVE(c)); + return isBLANK_LC(c); } return isBLANK_uni(c); } @@ -1923,7 +1919,7 @@ bool Perl_is_uni_space_lc(pTHX_ UV c) { if (c < 256) { - return isSPACE_LC(UNI_TO_NATIVE(c)); + return isSPACE_LC(c); } return isSPACE_uni(c); } @@ -1932,7 +1928,7 @@ bool Perl_is_uni_digit_lc(pTHX_ UV c) { if (c < 256) { - return isDIGIT_LC(UNI_TO_NATIVE(c)); + return isDIGIT_LC(c); } return _is_uni_FOO(_CC_DIGIT, c); } @@ -1941,7 +1937,7 @@ bool Perl_is_uni_upper_lc(pTHX_ UV c) { if (c < 256) { - return isUPPER_LC(UNI_TO_NATIVE(c)); + return isUPPER_LC(c); } return _is_uni_FOO(_CC_UPPER, c); } @@ -1950,7 +1946,7 @@ bool Perl_is_uni_lower_lc(pTHX_ UV c) { if (c < 256) { - return isLOWER_LC(UNI_TO_NATIVE(c)); + return isLOWER_LC(c); } return _is_uni_FOO(_CC_LOWER, c); } @@ -1959,7 +1955,7 @@ bool Perl_is_uni_cntrl_lc(pTHX_ UV c) { if (c < 256) { - return isCNTRL_LC(UNI_TO_NATIVE(c)); + return isCNTRL_LC(c); } return 0; } @@ -1968,7 +1964,7 @@ bool Perl_is_uni_graph_lc(pTHX_ UV c) { if (c < 256) { - return isGRAPH_LC(UNI_TO_NATIVE(c)); + return isGRAPH_LC(c); } return _is_uni_FOO(_CC_GRAPH, c); } @@ -1977,7 +1973,7 @@ bool Perl_is_uni_print_lc(pTHX_ UV c) { if (c < 256) { - return isPRINT_LC(UNI_TO_NATIVE(c)); + return isPRINT_LC(c); } return _is_uni_FOO(_CC_PRINT, c); } @@ -1986,7 +1982,7 @@ bool Perl_is_uni_punct_lc(pTHX_ UV c) { if (c < 256) { - return isPUNCT_LC(UNI_TO_NATIVE(c)); + return isPUNCT_LC(c); } return _is_uni_FOO(_CC_PUNCT, c); } @@ -1995,7 +1991,7 @@ bool Perl_is_uni_xdigit_lc(pTHX_ UV c) { if (c < 256) { - return isXDIGIT_LC(UNI_TO_NATIVE(c)); + return isXDIGIT_LC(c); } return isXDIGIT_uni(c); } @@ -2382,13 +2378,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special) { dVAR; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len = 0; - const UV uv0 = valid_utf8_to_uvchr(p, NULL); - /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings - * are necessary in EBCDIC, they are redundant no-ops - * in ASCII-ish platforms, and hopefully optimized away. */ - const UV uv1 = NATIVE_TO_UNI(uv0); + const UV uv1 = valid_utf8_to_uvchr(p, NULL); PERL_ARGS_ASSERT_TO_UTF8_CASE; @@ -2414,8 +2405,6 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, * be given */ } - uvuni_to_utf8(tmpbuf, uv1); - if (!*swashp) /* load on-demand */ *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); @@ -2426,56 +2415,26 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **svp; if (hv && - (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && + (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) && (*svp)) { const char *s; s = SvPV_const(*svp, len); if (len == 1) + /* EIGHTBIT */ len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp; else { -#ifdef EBCDIC - /* If we have EBCDIC we need to remap the characters - * since any characters in the low 256 are Unicode - * code points, not EBCDIC. */ - U8 *t = (U8*)s, *tend = t + len, *d; - - d = tmpbuf; - if (SvUTF8(*svp)) { - STRLEN tlen = 0; - - while (t < tend) { - const UV c = utf8_to_uvchr_buf(t, tend, &tlen); - if (tlen > 0) { - d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); - t += tlen; - } - else - break; - } - } - else { - while (t < tend) { - d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); - t++; - } - } - len = d - tmpbuf; - Copy(tmpbuf, ustrp, len, U8); -#else Copy(s, ustrp, len, U8); -#endif } } } if (!len && *swashp) { - const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE /* => is utf8 */); + const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */); if (uv2) { /* It was "normal" (a single character mapping). */ - const UV uv3 = UNI_TO_NATIVE(uv2); - len = uvchr_to_utf8(ustrp, uv3) - ustrp; + len = uvchr_to_utf8(ustrp, uv2) - ustrp; } } @@ -2496,7 +2455,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (lenp) *lenp = len; - return uv0; + return uv1; } @@ -3195,7 +3154,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) U32 bit; SV *swatch; U8 tmputf8[2]; - const UV c = NATIVE_TO_ASCII(*ptr); + const UV c = *ptr; PERL_ARGS_ASSERT_SWASH_FETCH; @@ -3209,7 +3168,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) } /* Convert to utf8 if not already */ - if (!do_utf8 && !UNI_IS_INVARIANT(c)) { + if (!do_utf8 && !NATIVE_IS_INVARIANT(c)) { tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c); ptr = tmputf8; @@ -4605,7 +4564,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c to_utf8_fold(p1, foldbuf1, &n1); } else { /* Not utf8, get utf8 fold */ - to_uni_fold(NATIVE_TO_LATIN1(*p1), foldbuf1, &n1); + to_uni_fold(*p1, foldbuf1, &n1); } f1 = foldbuf1; } @@ -4650,7 +4609,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c to_utf8_fold(p2, foldbuf2, &n2); } else { - to_uni_fold(NATIVE_TO_LATIN1(*p2), foldbuf2, &n2); + to_uni_fold(*p2, foldbuf2, &n2); } f2 = foldbuf2; } |