Remove EBCDIC remappings

Now that the Unicode tables are stored in native format, we shouldn't be doing remapping. Note that this assumes that the Latin1 casing tables are stored in native order; not all of this has been done yet.
author: Karl Williamson <public@khwilliamson.com> 2013-02-17 13:47:13 -0700
committer: Karl Williamson <public@khwilliamson.com> 2013-08-29 09:55:52 -0600
commit: f4cd282cd75f0333d0ce591c83812eed609d5283 (patch)
tree: 5b2e6e4c08b6fe67d75a9d93dea211e71988c3ca
parent: 94bb8c36d9e11dd4825e43d06f0832f01a7e5045 (diff)
download: perl-f4cd282cd75f0333d0ce591c83812eed609d5283.tar.gz
6 files changed, 52 insertions, 92 deletions
diff --git a/handy.h b/handy.h
index 73fdacace2..7b6db333df 100644
--- a/handy.h
+++ b/handy.h
@@ -990,7 +990,7 @@ EXTCONST U32 PL_charclass[];
     /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */
 #   define _CC_mask(classnum) (1U << (classnum))
 #   define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \
-                && (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(classnum)))
+                && (PL_charclass[(U8) (c)] & _CC_mask(classnum)))
 
     /* The mask for the _A versions of the macros; it just adds in the bit for
      * ASCII. */
@@ -999,7 +999,7 @@ EXTCONST U32 PL_charclass[];
     /* The _A version makes sure that both the desired bit and the ASCII bit
      * are present */
 #   define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \
-        && ((PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask_A(classnum)) \
+        && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \
                                 == _CC_mask_A(classnum)))
 
 #   define isALPHA_A(c)  _generic_isCC_A(c, _CC_ALPHA)
@@ -1020,7 +1020,7 @@ EXTCONST U32 PL_charclass[];
 
     /* Either participates in a fold with a character above 255, or is a
      * multi-char fold */
-#   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_LATIN1(c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
+#   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
 
 #   define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)
 #   define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
@@ -1177,8 +1177,7 @@ EXTCONST U32 PL_charclass[];
  * out-of-range */
 #define toLOWER_LATIN1(c)    ((! FITS_IN_8_BITS(c))                        \
                              ? (c)                                         \
-                             : LATIN1_TO_NATIVE(PL_latin1_lc[              \
-                                            NATIVE_TO_LATIN1( (U8) (c)) ]))
+                             : PL_latin1_lc[ (U8) (c) ])
 #define toLOWER_L1(c)    toLOWER_LATIN1(c)  /* Synonym for consistency */
 
 /* Modified uc.  Is correct uc except for three non-ascii chars which are
@@ -1186,8 +1185,7 @@ EXTCONST U32 PL_charclass[];
  * character for input out-of-range */
 #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c))                       \
                                ? (c)                                       \
-                               : LATIN1_TO_NATIVE(PL_mod_latin1_uc[        \
-                                            NATIVE_TO_LATIN1( (U8) (c)) ]))
+                               : PL_mod_latin1_uc[ (U8) (c) ])
 #ifdef USE_NEXT_CTYPE
 
 #  define isALPHANUMERIC_LC(c)	NXIsAlNum((unsigned int)(c))
diff --git a/perly.c b/perly.c
index d7d9ea34c6..5a934dccd9 100644
--- a/perly.c
+++ b/perly.c
@@ -342,9 +342,12 @@ Perl_yyparse (pTHX_ int gramtype)
 	parser->yychar = yylex();
 #endif
 
+/* perly.tab is shipped based on an ASCII system; if it were to be regenerated
+ * on a platform that doesn't use ASCII, this translation back would need to be
+ * removed */
 #  ifdef EBCDIC
 	if (parser->yychar >= 0 && parser->yychar < 255) {
-	    parser->yychar = NATIVE_TO_ASCII(parser->yychar);
+	    parser->yychar = NATIVE_TO_LATIN1(parser->yychar);
 	}
 #  endif
     }
diff --git a/pp.c b/pp.c
index cd50626696..111012d3a0 100644
--- a/pp.c
+++ b/pp.c
@@ -4244,7 +4244,7 @@ PP(pp_fc)
                     for (; s < send; s++) {
                         STRLEN ulen;
                         UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags);
-                        if UNI_IS_INVARIANT(fc) {
+                        if NATIVE_IS_INVARIANT(fc) {
                             if (full_folding
                                 && *s == LATIN_SMALL_LETTER_SHARP_S)
                             {
diff --git a/regcomp.c b/regcomp.c
index 34aefa107d..bb89a5443d 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1737,7 +1737,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 		    if ( !UTF ) {
 			/* store first byte of utf8 representation of
 			   variant codepoints */
-			if (! UNI_IS_INVARIANT(uvc)) {
+			if (! NATIVE_IS_INVARIANT(uvc)) {
 			    TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
 			}
 		    }
@@ -4276,8 +4276,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
 			    ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
                             for (value = 0; value < loop_max; value++) {
-                                if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
+                                if (! _generic_isCC(value, classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
                                 }
                             }
 			}
@@ -4292,8 +4292,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
 			 * in case it isn't a true locale-node.  This will
 			 * create false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
+                            if (_generic_isCC(value, classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, value);
                             }
                         }
                         }
@@ -4310,8 +4310,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
 			    ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
                             for (value = 0; value < loop_max; value++) {
-                                if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
+                                if (_generic_isCC(value, classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, value);
                                 }
                             }
 			}
@@ -4326,8 +4326,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
 			 * case it isn't a true locale-node.  This will create
 			 * false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
+                            if (! _generic_isCC(value, classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, value);
                             }
                         }
                         if (PL_regkind[OP(scan)] == NPOSIXD) {
@@ -10183,7 +10183,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
     if (! len_passed_in) {
         if (UTF) {
             if (FOLD && (! LOC || code_point > 255)) {
-                _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+                _to_uni_fold_flags(code_point,
                                    character,
                                    &len,
                                    FOLD_FLAGS_FULL | ((LOC)
diff --git a/regexec.c b/regexec.c
index 384e4e79b7..db6b7306c2 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1629,13 +1629,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case BOUNDL:
         RXp_MATCH_TAINTED_on(prog);
         FBC_BOUND(isWORDCHAR_LC,
-                  isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                  isWORDCHAR_LC_uvchr(tmp),
                   isWORDCHAR_LC_utf8((U8*)s));
         break;
     case NBOUNDL:
         RXp_MATCH_TAINTED_on(prog);
         FBC_NBOUND(isWORDCHAR_LC,
-                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                   isWORDCHAR_LC_uvchr(tmp),
                    isWORDCHAR_LC_utf8((U8*)s));
         break;
     case BOUND:
@@ -4305,7 +4305,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     }
 		}
 		else {
-		    ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
+		    ln = isWORDCHAR_LC_uvchr(ln);
 		    n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
 		}
 	    }
diff --git a/utf8.c b/utf8.c
index 2d827a1c7a..8d7e6de3bd 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1407,11 +1407,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 	p += 2;
 	if (uv < 0x80) {
-#ifdef EBCDIC
-	    *d++ = LATIN1_TO_NATIVE(uv);
-#else
 	    *d++ = (U8)uv;
-#endif
 	    continue;
 	}
 	if (uv < 0x800) {
@@ -1645,8 +1641,8 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
 
     assert(S_or_s == 'S' || S_or_s == 's');
 
-    if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for
-					  characters in this range */
+    if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
+					     characters in this range */
 	*p = (U8) converted;
 	*lenp = 1;
 	return converted;
@@ -1746,7 +1742,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
     U8 converted = toLOWER_LATIN1(c);
 
     if (p != NULL) {
-	if (UNI_IS_INVARIANT(converted)) {
+	if (NATIVE_IS_INVARIANT(converted)) {
 	    *p = converted;
 	    *lenp = 1;
 	}
@@ -1816,7 +1812,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
 	converted = toLOWER_LATIN1(c);
     }
 
-    if (UNI_IS_INVARIANT(converted)) {
+    if (NATIVE_IS_INVARIANT(converted)) {
 	*p = (U8) converted;
 	*lenp = 1;
     }
@@ -1869,7 +1865,7 @@ bool
 Perl_is_uni_alnum_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isALNUM_LC(UNI_TO_NATIVE(c));
+        return isALNUM_LC(c);
     }
     return _is_uni_FOO(_CC_WORDCHAR, c);
 }
@@ -1878,7 +1874,7 @@ bool
 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isALPHANUMERIC_LC(UNI_TO_NATIVE(c));
+        return isALPHANUMERIC_LC(c);
     }
     return _is_uni_FOO(_CC_ALPHANUMERIC, c);
 }
@@ -1887,7 +1883,7 @@ bool
 Perl_is_uni_idfirst_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isIDFIRST_LC(UNI_TO_NATIVE(c));
+        return isIDFIRST_LC(c);
     }
     return _is_uni_perl_idstart(c);
 }
@@ -1896,7 +1892,7 @@ bool
 Perl_is_uni_alpha_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isALPHA_LC(UNI_TO_NATIVE(c));
+        return isALPHA_LC(c);
     }
     return _is_uni_FOO(_CC_ALPHA, c);
 }
@@ -1905,7 +1901,7 @@ bool
 Perl_is_uni_ascii_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isASCII_LC(UNI_TO_NATIVE(c));
+        return isASCII_LC(c);
     }
     return 0;
 }
@@ -1914,7 +1910,7 @@ bool
 Perl_is_uni_blank_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isBLANK_LC(UNI_TO_NATIVE(c));
+        return isBLANK_LC(c);
     }
     return isBLANK_uni(c);
 }
@@ -1923,7 +1919,7 @@ bool
 Perl_is_uni_space_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isSPACE_LC(UNI_TO_NATIVE(c));
+        return isSPACE_LC(c);
     }
     return isSPACE_uni(c);
 }
@@ -1932,7 +1928,7 @@ bool
 Perl_is_uni_digit_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isDIGIT_LC(UNI_TO_NATIVE(c));
+        return isDIGIT_LC(c);
     }
     return _is_uni_FOO(_CC_DIGIT, c);
 }
@@ -1941,7 +1937,7 @@ bool
 Perl_is_uni_upper_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isUPPER_LC(UNI_TO_NATIVE(c));
+        return isUPPER_LC(c);
     }
     return _is_uni_FOO(_CC_UPPER, c);
 }
@@ -1950,7 +1946,7 @@ bool
 Perl_is_uni_lower_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isLOWER_LC(UNI_TO_NATIVE(c));
+        return isLOWER_LC(c);
     }
     return _is_uni_FOO(_CC_LOWER, c);
 }
@@ -1959,7 +1955,7 @@ bool
 Perl_is_uni_cntrl_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isCNTRL_LC(UNI_TO_NATIVE(c));
+        return isCNTRL_LC(c);
     }
     return 0;
 }
@@ -1968,7 +1964,7 @@ bool
 Perl_is_uni_graph_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isGRAPH_LC(UNI_TO_NATIVE(c));
+        return isGRAPH_LC(c);
     }
     return _is_uni_FOO(_CC_GRAPH, c);
 }
@@ -1977,7 +1973,7 @@ bool
 Perl_is_uni_print_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isPRINT_LC(UNI_TO_NATIVE(c));
+        return isPRINT_LC(c);
     }
     return _is_uni_FOO(_CC_PRINT, c);
 }
@@ -1986,7 +1982,7 @@ bool
 Perl_is_uni_punct_lc(pTHX_ UV c)
 {
     if (c < 256) {
-        return isPUNCT_LC(UNI_TO_NATIVE(c));
+        return isPUNCT_LC(c);
     }
     return _is_uni_FOO(_CC_PUNCT, c);
 }
@@ -1995,7 +1991,7 @@ bool
 Perl_is_uni_xdigit_lc(pTHX_ UV c)
 {
     if (c < 256) {
-       return isXDIGIT_LC(UNI_TO_NATIVE(c));
+       return isXDIGIT_LC(c);
     }
     return isXDIGIT_uni(c);
 }
@@ -2382,13 +2378,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
 			SV **swashp, const char *normal, const char *special)
 {
     dVAR;
-    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     STRLEN len = 0;
-    const UV uv0 = valid_utf8_to_uvchr(p, NULL);
-    /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
-     * are necessary in EBCDIC, they are redundant no-ops
-     * in ASCII-ish platforms, and hopefully optimized away. */
-    const UV uv1 = NATIVE_TO_UNI(uv0);
+    const UV uv1 = valid_utf8_to_uvchr(p, NULL);
 
     PERL_ARGS_ASSERT_TO_UTF8_CASE;
 
@@ -2414,8 +2405,6 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
 	 * be given */
     }
 
-    uvuni_to_utf8(tmpbuf, uv1);
-
     if (!*swashp) /* load on-demand */
          *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
 
@@ -2426,56 +2415,26 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
 	 SV **svp;
 
 	 if (hv &&
-	     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
+	     (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
 	     (*svp)) {
 	     const char *s;
 
 	      s = SvPV_const(*svp, len);
 	      if (len == 1)
+                  /* EIGHTBIT */
 		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
 	      else {
-#ifdef EBCDIC
-		   /* If we have EBCDIC we need to remap the characters
-		    * since any characters in the low 256 are Unicode
-		    * code points, not EBCDIC. */
-		   U8 *t = (U8*)s, *tend = t + len, *d;
-		
-		   d = tmpbuf;
-		   if (SvUTF8(*svp)) {
-			STRLEN tlen = 0;
-			
-			while (t < tend) {
-			     const UV c = utf8_to_uvchr_buf(t, tend, &tlen);
-			     if (tlen > 0) {
-				  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
-				  t += tlen;
-			     }
-			     else
-				  break;
-			}
-		   }
-		   else {
-			while (t < tend) {
-			     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
-			     t++;
-			}
-		   }
-		   len = d - tmpbuf;
-		   Copy(tmpbuf, ustrp, len, U8);
-#else
 		   Copy(s, ustrp, len, U8);
-#endif
 	      }
 	 }
     }
 
     if (!len && *swashp) {
-	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE /* => is utf8 */);
+	const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
 
 	 if (uv2) {
 	      /* It was "normal" (a single character mapping). */
-	      const UV uv3 = UNI_TO_NATIVE(uv2);
-	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
+	      len = uvchr_to_utf8(ustrp, uv2) - ustrp;
 	 }
     }
 
@@ -2496,7 +2455,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
     if (lenp)
 	 *lenp = len;
 
-    return uv0;
+    return uv1;
 
 }
 
@@ -3195,7 +3154,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
     U32 bit;
     SV *swatch;
     U8 tmputf8[2];
-    const UV c = NATIVE_TO_ASCII(*ptr);
+    const UV c = *ptr;
 
     PERL_ARGS_ASSERT_SWASH_FETCH;
 
@@ -3209,7 +3168,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
     }
 
     /* Convert to utf8 if not already */
-    if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
+    if (!do_utf8 && !NATIVE_IS_INVARIANT(c)) {
 	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
 	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
 	ptr = tmputf8;
@@ -4605,7 +4564,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
 		    to_utf8_fold(p1, foldbuf1, &n1);
 		}
 		else {  /* Not utf8, get utf8 fold */
-		    to_uni_fold(NATIVE_TO_LATIN1(*p1), foldbuf1, &n1);
+		    to_uni_fold(*p1, foldbuf1, &n1);
 		}
 		f1 = foldbuf1;
 	    }
@@ -4650,7 +4609,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
 		    to_utf8_fold(p2, foldbuf2, &n2);
 		}
 		else {
-		    to_uni_fold(NATIVE_TO_LATIN1(*p2), foldbuf2, &n2);
+		    to_uni_fold(*p2, foldbuf2, &n2);
 		}
 		f2 = foldbuf2;
 	    }
author	Karl Williamson <public@khwilliamson.com>	2013-02-17 13:47:13 -0700
committer	Karl Williamson <public@khwilliamson.com>	2013-08-29 09:55:52 -0600
commit	f4cd282cd75f0333d0ce591c83812eed609d5283 (patch)
tree	5b2e6e4c08b6fe67d75a9d93dea211e71988c3ca
parent	94bb8c36d9e11dd4825e43d06f0832f01a7e5045 (diff)
download	perl-f4cd282cd75f0333d0ce591c83812eed609d5283.tar.gz