diff options
-rw-r--r-- | handy.h | 12 | ||||
-rw-r--r-- | perly.c | 5 | ||||
-rw-r--r-- | pp.c | 2 | ||||
-rw-r--r-- | regcomp.c | 20 | ||||
-rw-r--r-- | regexec.c | 6 | ||||
-rw-r--r-- | utf8.c | 99 |
6 files changed, 52 insertions, 92 deletions
@@ -990,7 +990,7 @@ EXTCONST U32 PL_charclass[]; /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ # define _CC_mask(classnum) (1U << (classnum)) # define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ - && (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(classnum))) + && (PL_charclass[(U8) (c)] & _CC_mask(classnum))) /* The mask for the _A versions of the macros; it just adds in the bit for * ASCII. */ @@ -999,7 +999,7 @@ EXTCONST U32 PL_charclass[]; /* The _A version makes sure that both the desired bit and the ASCII bit * are present */ # define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \ - && ((PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask_A(classnum)) \ + && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \ == _CC_mask_A(classnum))) # define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA) @@ -1020,7 +1020,7 @@ EXTCONST U32 PL_charclass[]; /* Either participates in a fold with a character above 255, or is a * multi-char fold */ -# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(_CC_NONLATIN1_FOLD))) +# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD))) # define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA) # define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \ @@ -1177,8 +1177,7 @@ EXTCONST U32 PL_charclass[]; * out-of-range */ #define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \ ? (c) \ - : LATIN1_TO_NATIVE(PL_latin1_lc[ \ - NATIVE_TO_LATIN1( (U8) (c)) ])) + : PL_latin1_lc[ (U8) (c) ]) #define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */ /* Modified uc. Is correct uc except for three non-ascii chars which are @@ -1186,8 +1185,7 @@ EXTCONST U32 PL_charclass[]; * character for input out-of-range */ #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \ ? (c) \ - : LATIN1_TO_NATIVE(PL_mod_latin1_uc[ \ - NATIVE_TO_LATIN1( (U8) (c)) ])) + : PL_mod_latin1_uc[ (U8) (c) ]) #ifdef USE_NEXT_CTYPE # define isALPHANUMERIC_LC(c) NXIsAlNum((unsigned int)(c)) @@ -342,9 +342,12 @@ Perl_yyparse (pTHX_ int gramtype) parser->yychar = yylex(); #endif +/* perly.tab is shipped based on an ASCII system; if it were to be regenerated + * on a platform that doesn't use ASCII, this translation back would need to be + * removed */ # ifdef EBCDIC if (parser->yychar >= 0 && parser->yychar < 255) { - parser->yychar = NATIVE_TO_ASCII(parser->yychar); + parser->yychar = NATIVE_TO_LATIN1(parser->yychar); } # endif } @@ -4244,7 +4244,7 @@ PP(pp_fc) for (; s < send; s++) { STRLEN ulen; UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags); - if UNI_IS_INVARIANT(fc) { + if NATIVE_IS_INVARIANT(fc) { if (full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) { @@ -1737,7 +1737,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! NATIVE_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } @@ -4276,8 +4276,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value)); + if (! _generic_isCC(value, classnum)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); } } } @@ -4292,8 +4292,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", * in case it isn't a true locale-node. This will * create false positives if it truly is locale */ for (value = 0; value < loop_max; value++) { - if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value)); + if (_generic_isCC(value, classnum)) { + ANYOF_BITMAP_SET(data->start_class, value); } } } @@ -4310,8 +4310,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); for (value = 0; value < loop_max; value++) { - if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value)); + if (_generic_isCC(value, classnum)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); } } } @@ -4326,8 +4326,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", * case it isn't a true locale-node. This will create * false positives if it truly is locale */ for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value)); + if (! _generic_isCC(value, classnum)) { + ANYOF_BITMAP_SET(data->start_class, value); } } if (PL_regkind[OP(scan)] == NPOSIXD) { @@ -10183,7 +10183,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 if (! len_passed_in) { if (UTF) { if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + _to_uni_fold_flags(code_point, character, &len, FOLD_FLAGS_FULL | ((LOC) @@ -1629,13 +1629,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case BOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -4305,7 +4305,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -1407,11 +1407,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ p += 2; if (uv < 0x80) { -#ifdef EBCDIC - *d++ = LATIN1_TO_NATIVE(uv); -#else *d++ = (U8)uv; -#endif continue; } if (uv < 0x800) { @@ -1645,8 +1641,8 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ assert(S_or_s == 'S' || S_or_s == 's'); - if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for - characters in this range */ + if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for + characters in this range */ *p = (U8) converted; *lenp = 1; return converted; @@ -1746,7 +1742,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp) U8 converted = toLOWER_LATIN1(c); if (p != NULL) { - if (UNI_IS_INVARIANT(converted)) { + if (NATIVE_IS_INVARIANT(converted)) { *p = converted; *lenp = 1; } @@ -1816,7 +1812,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f converted = toLOWER_LATIN1(c); } - if (UNI_IS_INVARIANT(converted)) { + if (NATIVE_IS_INVARIANT(converted)) { *p = (U8) converted; *lenp = 1; } @@ -1869,7 +1865,7 @@ bool Perl_is_uni_alnum_lc(pTHX_ UV c) { if (c < 256) { - return isALNUM_LC(UNI_TO_NATIVE(c)); + return isALNUM_LC(c); } return _is_uni_FOO(_CC_WORDCHAR, c); } @@ -1878,7 +1874,7 @@ bool Perl_is_uni_alnumc_lc(pTHX_ UV c) { if (c < 256) { - return isALPHANUMERIC_LC(UNI_TO_NATIVE(c)); + return isALPHANUMERIC_LC(c); } return _is_uni_FOO(_CC_ALPHANUMERIC, c); } @@ -1887,7 +1883,7 @@ bool Perl_is_uni_idfirst_lc(pTHX_ UV c) { if (c < 256) { - return isIDFIRST_LC(UNI_TO_NATIVE(c)); + return isIDFIRST_LC(c); } return _is_uni_perl_idstart(c); } @@ -1896,7 +1892,7 @@ bool Perl_is_uni_alpha_lc(pTHX_ UV c) { if (c < 256) { - return isALPHA_LC(UNI_TO_NATIVE(c)); + return isALPHA_LC(c); } return _is_uni_FOO(_CC_ALPHA, c); } @@ -1905,7 +1901,7 @@ bool Perl_is_uni_ascii_lc(pTHX_ UV c) { if (c < 256) { - return isASCII_LC(UNI_TO_NATIVE(c)); + return isASCII_LC(c); } return 0; } @@ -1914,7 +1910,7 @@ bool Perl_is_uni_blank_lc(pTHX_ UV c) { if (c < 256) { - return isBLANK_LC(UNI_TO_NATIVE(c)); + return isBLANK_LC(c); } return isBLANK_uni(c); } @@ -1923,7 +1919,7 @@ bool Perl_is_uni_space_lc(pTHX_ UV c) { if (c < 256) { - return isSPACE_LC(UNI_TO_NATIVE(c)); + return isSPACE_LC(c); } return isSPACE_uni(c); } @@ -1932,7 +1928,7 @@ bool Perl_is_uni_digit_lc(pTHX_ UV c) { if (c < 256) { - return isDIGIT_LC(UNI_TO_NATIVE(c)); + return isDIGIT_LC(c); } return _is_uni_FOO(_CC_DIGIT, c); } @@ -1941,7 +1937,7 @@ bool Perl_is_uni_upper_lc(pTHX_ UV c) { if (c < 256) { - return isUPPER_LC(UNI_TO_NATIVE(c)); + return isUPPER_LC(c); } return _is_uni_FOO(_CC_UPPER, c); } @@ -1950,7 +1946,7 @@ bool Perl_is_uni_lower_lc(pTHX_ UV c) { if (c < 256) { - return isLOWER_LC(UNI_TO_NATIVE(c)); + return isLOWER_LC(c); } return _is_uni_FOO(_CC_LOWER, c); } @@ -1959,7 +1955,7 @@ bool Perl_is_uni_cntrl_lc(pTHX_ UV c) { if (c < 256) { - return isCNTRL_LC(UNI_TO_NATIVE(c)); + return isCNTRL_LC(c); } return 0; } @@ -1968,7 +1964,7 @@ bool Perl_is_uni_graph_lc(pTHX_ UV c) { if (c < 256) { - return isGRAPH_LC(UNI_TO_NATIVE(c)); + return isGRAPH_LC(c); } return _is_uni_FOO(_CC_GRAPH, c); } @@ -1977,7 +1973,7 @@ bool Perl_is_uni_print_lc(pTHX_ UV c) { if (c < 256) { - return isPRINT_LC(UNI_TO_NATIVE(c)); + return isPRINT_LC(c); } return _is_uni_FOO(_CC_PRINT, c); } @@ -1986,7 +1982,7 @@ bool Perl_is_uni_punct_lc(pTHX_ UV c) { if (c < 256) { - return isPUNCT_LC(UNI_TO_NATIVE(c)); + return isPUNCT_LC(c); } return _is_uni_FOO(_CC_PUNCT, c); } @@ -1995,7 +1991,7 @@ bool Perl_is_uni_xdigit_lc(pTHX_ UV c) { if (c < 256) { - return isXDIGIT_LC(UNI_TO_NATIVE(c)); + return isXDIGIT_LC(c); } return isXDIGIT_uni(c); } @@ -2382,13 +2378,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special) { dVAR; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len = 0; - const UV uv0 = valid_utf8_to_uvchr(p, NULL); - /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings - * are necessary in EBCDIC, they are redundant no-ops - * in ASCII-ish platforms, and hopefully optimized away. */ - const UV uv1 = NATIVE_TO_UNI(uv0); + const UV uv1 = valid_utf8_to_uvchr(p, NULL); PERL_ARGS_ASSERT_TO_UTF8_CASE; @@ -2414,8 +2405,6 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, * be given */ } - uvuni_to_utf8(tmpbuf, uv1); - if (!*swashp) /* load on-demand */ *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); @@ -2426,56 +2415,26 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **svp; if (hv && - (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && + (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) && (*svp)) { const char *s; s = SvPV_const(*svp, len); if (len == 1) + /* EIGHTBIT */ len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp; else { -#ifdef EBCDIC - /* If we have EBCDIC we need to remap the characters - * since any characters in the low 256 are Unicode - * code points, not EBCDIC. */ - U8 *t = (U8*)s, *tend = t + len, *d; - - d = tmpbuf; - if (SvUTF8(*svp)) { - STRLEN tlen = 0; - - while (t < tend) { - const UV c = utf8_to_uvchr_buf(t, tend, &tlen); - if (tlen > 0) { - d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); - t += tlen; - } - else - break; - } - } - else { - while (t < tend) { - d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); - t++; - } - } - len = d - tmpbuf; - Copy(tmpbuf, ustrp, len, U8); -#else Copy(s, ustrp, len, U8); -#endif } } } if (!len && *swashp) { - const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE /* => is utf8 */); + const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */); if (uv2) { /* It was "normal" (a single character mapping). */ - const UV uv3 = UNI_TO_NATIVE(uv2); - len = uvchr_to_utf8(ustrp, uv3) - ustrp; + len = uvchr_to_utf8(ustrp, uv2) - ustrp; } } @@ -2496,7 +2455,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (lenp) *lenp = len; - return uv0; + return uv1; } @@ -3195,7 +3154,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) U32 bit; SV *swatch; U8 tmputf8[2]; - const UV c = NATIVE_TO_ASCII(*ptr); + const UV c = *ptr; PERL_ARGS_ASSERT_SWASH_FETCH; @@ -3209,7 +3168,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) } /* Convert to utf8 if not already */ - if (!do_utf8 && !UNI_IS_INVARIANT(c)) { + if (!do_utf8 && !NATIVE_IS_INVARIANT(c)) { tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c); ptr = tmputf8; @@ -4605,7 +4564,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c to_utf8_fold(p1, foldbuf1, &n1); } else { /* Not utf8, get utf8 fold */ - to_uni_fold(NATIVE_TO_LATIN1(*p1), foldbuf1, &n1); + to_uni_fold(*p1, foldbuf1, &n1); } f1 = foldbuf1; } @@ -4650,7 +4609,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c to_utf8_fold(p2, foldbuf2, &n2); } else { - to_uni_fold(NATIVE_TO_LATIN1(*p2), foldbuf2, &n2); + to_uni_fold(*p2, foldbuf2, &n2); } f2 = foldbuf2; } |