summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-05-25 11:15:01 -0600
committerSawyer X <xsawyerx@cpan.org>2020-05-27 11:09:32 +0300
commit65a97d491a9a6a59c242ed7027d1a284cf7c8d6d (patch)
treecf1b5c30349c8c0eec986a6ec7ebdf40de3d9bcd
parente2d0e9a5d1dd29d4005ca9a3e73222eeabd9e2b7 (diff)
downloadperl-65a97d491a9a6a59c242ed7027d1a284cf7c8d6d.tar.gz
Fix tr/// determination of inplace editing for EBCDIC
I realized as a result of fixing GH #17654, that the code didn't properly decide if a tr/// can be done in-place on EBCDIC platforms. Since we didn't have an EBCDIC smoker at the time, I couldn't be sure that the fix actually worked. Now that we do have a smoker, I have successfully tested it. This patch is constructed so that the code generated on non-EBCDIC platforms should not be changed by it.
-rw-r--r--ebcdic_tables.h110
-rw-r--r--op.c58
-rw-r--r--regen/ebcdic.pl50
3 files changed, 38 insertions, 180 deletions
diff --git a/ebcdic_tables.h b/ebcdic_tables.h
index 9fdcbb6e76..cf1beeb85e 100644
--- a/ebcdic_tables.h
+++ b/ebcdic_tables.h
@@ -413,60 +413,6 @@ SOFTWARE.
};
# endif
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-# ifndef DOINIT
- EXTCONST UV PL_partition_by_byte_length[38];
-# else
- EXTCONST UV PL_partition_by_byte_length[38] = {
- 0x00,
- 0x41,
- 0x4b,
- 0x51,
- 0x5a,
- 0x62,
- 0x6b,
- 0x70,
- 0x79,
- 0x80,
- 0x81,
- 0x8a,
- 0x91,
- 0x9a,
- 0xa1,
- 0xaa,
- 0xad,
- 0xae,
- 0xbd,
- 0xbe,
- 0xc0,
- 0xca,
- 0xd0,
- 0xda,
- 0xe0,
- 0xe1,
- 0xe2,
- 0xea,
- 0xf0,
- 0xfa,
- 0xff,
- 0x100,
- 0x400,
- 0x4000,
- 0x40000,
- 0x400000,
- 0x4000000,
- 0x40000000
-};
-# endif
-
#endif /* EBCDIC 1047 */
#if 'A' == 193 /* EBCDIC 037 */ \
@@ -845,62 +791,6 @@ SOFTWARE.
};
# endif
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-# ifndef DOINIT
- EXTCONST UV PL_partition_by_byte_length[40];
-# else
- EXTCONST UV PL_partition_by_byte_length[40] = {
- 0x00,
- 0x41,
- 0x4b,
- 0x51,
- 0x5a,
- 0x5f,
- 0x60,
- 0x62,
- 0x6b,
- 0x70,
- 0x79,
- 0x80,
- 0x81,
- 0x8a,
- 0x91,
- 0x9a,
- 0xa1,
- 0xaa,
- 0xb0,
- 0xb1,
- 0xba,
- 0xbc,
- 0xc0,
- 0xca,
- 0xd0,
- 0xda,
- 0xe0,
- 0xe1,
- 0xe2,
- 0xea,
- 0xf0,
- 0xfa,
- 0xff,
- 0x100,
- 0x400,
- 0x4000,
- 0x40000,
- 0x400000,
- 0x4000000,
- 0x40000000
-};
-# endif
-
#endif /* EBCDIC 037 */
#endif /* PERL_EBCDIC_TABLES_H_ */
diff --git a/op.c b/op.c
index 135d08e2fc..0ddc710fba 100644
--- a/op.c
+++ b/op.c
@@ -7061,12 +7061,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
* these up into smaller chunks, but doesn't merge any together. This
* makes it easy to find the instances it's looking for. A second pass is
* done after this has been determined which merges things together to
- * shrink the table for runtime. For ASCII platforms, the table is
- * trivial, given below, and uses the fundamental characteristics of UTF-8
- * to construct the values. For EBCDIC, it isn't so, and we rely on a
- * table constructed by the perl script that generates these kinds of
- * things */
-#ifndef EBCDIC
+ * shrink the table for runtime. The table below is used for both ASCII
+ * and EBCDIC platforms. On EBCDIC, the byte length is not monotonically
+ * increasing for code points below 256. To correct for that, the macro
+ * CP_ADJUST defined below converts those code points to ASCII in the first
+ * pass, and we use the ASCII partition values. That works because the
+ * growth factor will be unaffected, which is all that is calculated during
+ * the first pass. */
UV PL_partition_by_byte_length[] = {
0,
0x80, /* Below this is 1 byte representations */
@@ -7083,8 +7084,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
};
-#endif
-
PERL_ARGS_ASSERT_PMTRANS;
PL_hints |= HINT_BLOCK_SCOPE;
@@ -7212,6 +7211,21 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
t_array = invlist_array(t_invlist);
}
+/* As noted earlier, we convert EBCDIC code points to Unicode in the first pass
+ * so as to get the well-behaved length 1 vs length 2 boundary. Only code
+ * points below 256 differ between the two character sets in this regard. For
+ * these, we also can't have any ranges, as they have to be individually
+ * converted. */
+#ifdef EBCDIC
+# define CP_ADJUST(x) ((pass2) ? (x) : NATIVE_TO_UNI(x))
+# define FORCE_RANGE_LEN_1(x) ((pass2) ? 0 : ((x) < 256))
+# define CP_SKIP(x) ((pass2) ? UVCHR_SKIP(x) : OFFUNISKIP(x))
+#else
+# define CP_ADJUST(x) (x)
+# define FORCE_RANGE_LEN_1(x) 0
+# define CP_SKIP(x) UVCHR_SKIP(x)
+#endif
+
/* And the mapping of each of the ranges is initialized. Initially,
* everything is TR_UNLISTED. */
for (i = 0; i < len; i++) {
@@ -7345,7 +7359,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
/* Here, not in the middle of a range, and not UTF-8. The
* next code point is the single byte where we're at */
- t_cp = *t;
+ t_cp = CP_ADJUST(*t);
t_range_count = 1;
t++;
}
@@ -7356,7 +7370,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
* next code point is the next UTF-8 char in the input. We
* know the input is valid, because the toker constructed
* it */
- t_cp = valid_utf8_to_uvchr(t, &t_char_len);
+ t_cp = CP_ADJUST(valid_utf8_to_uvchr(t, &t_char_len));
t += t_char_len;
/* UTF-8 strings (only) have been parsed in toke.c to have
@@ -7364,7 +7378,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
* the first element of a range. If so, get the final
* element and calculate the range size. If not, the range
* size is 1 */
- if (t < tend && *t == RANGE_INDICATOR) {
+ if ( t < tend && *t == RANGE_INDICATOR
+ && ! FORCE_RANGE_LEN_1(t_cp))
+ {
t++;
t_range_count = valid_utf8_to_uvchr(t, &t_char_len)
- t_cp + 1;
@@ -7396,16 +7412,18 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
else {
if (! rstr_utf8) {
- r_cp = *r;
+ r_cp = CP_ADJUST(*r);
r_range_count = 1;
r++;
}
else {
Size_t r_char_len;
- r_cp = valid_utf8_to_uvchr(r, &r_char_len);
+ r_cp = CP_ADJUST(valid_utf8_to_uvchr(r, &r_char_len));
r += r_char_len;
- if (r < rend && *r == RANGE_INDICATOR) {
+ if ( r < rend && *r == RANGE_INDICATOR
+ && ! FORCE_RANGE_LEN_1(r_cp))
+ {
r++;
r_range_count = valid_utf8_to_uvchr(r,
&r_char_len) - r_cp + 1;
@@ -7537,7 +7555,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
* code point in the rhs against any code point in the lhs. */
if ( ! pass2
&& r_cp_end != TR_SPECIAL_HANDLING
- && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end))
+ && CP_SKIP(t_cp_end) < CP_SKIP(r_cp_end))
{
/* Here, we will need to make a copy of the input string
* before doing the transliteration. The worst possible
@@ -7560,8 +7578,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
* string not being UTF-8 */
NV t_size = (can_force_utf8 && t_cp < 256)
? 1
- : UVCHR_SKIP(t_cp_end);
- NV ratio = UVCHR_SKIP(r_cp_end) / t_size;
+ : CP_SKIP(t_cp_end);
+ NV ratio = CP_SKIP(r_cp_end) / t_size;
o->op_private |= OPpTRANS_GROWS;
@@ -7594,8 +7612,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
* is if it 'grows'. But in the 2nd pass, there's no
* reason to not merge */
if ( (i > 0 && ( pass2
- || UVCHR_SKIP(t_array[i-1])
- == UVCHR_SKIP(t_cp)))
+ || CP_SKIP(t_array[i-1])
+ == CP_SKIP(t_cp)))
&& ( ( r_cp == TR_SPECIAL_HANDLING
&& r_map[i-1] == TR_SPECIAL_HANDLING)
|| ( r_cp != TR_SPECIAL_HANDLING
@@ -7615,7 +7633,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
adjacent_to_range_above = TRUE;
if (i + 1 < len)
if ( ( pass2
- || UVCHR_SKIP(t_cp) == UVCHR_SKIP(t_array[i+1]))
+ || CP_SKIP(t_cp) == CP_SKIP(t_array[i+1]))
&& ( ( r_cp == TR_SPECIAL_HANDLING
&& r_map[i+1] == (UV) TR_SPECIAL_HANDLING)
|| ( r_cp != TR_SPECIAL_HANDLING
diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl
index cacf732b0c..863e9b9adc 100644
--- a/regen/ebcdic.pl
+++ b/regen/ebcdic.pl
@@ -779,56 +779,6 @@ END
output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
}
- {
- print $out_fh <<EOF;
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-EOF
- # The lengths of the characters between 0 and 255 are either 1 or 2,
- # with those whose ASCII platform equivalents below 160 being 1, and
- # the rest being 2.
- my @list;
- push @list, 0;
- my $pushed_range_is_length_1 = 1;
-
- for my $i (1 .. 0xFF) {
- my $this_code_point_is_length_1 = ($e2a[$i] < 160);
- if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
- push @list, $i;
- $pushed_range_is_length_1 = $this_code_point_is_length_1;
- }
- }
-
- # Starting at 256, the length is 2.
- push @list, 0x100 if $pushed_range_is_length_1;
-
- # These are based on the fundamental properties of UTF-EBCDIC. Each
- # continuation byte has 5 bits of information. Comments in utf8.h
- # explain the rest.
- my $UTF_ACCUMULATION_SHIFT = 5;
- push @list, (32 * (1 << ( $UTF_ACCUMULATION_SHIFT)));
- push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
- push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
- push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
- push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
- push @list, ( (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
-
- output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list);
- print $out_fh "\t";
-
- print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
- print $out_fh "\n";
-
- output_table_end($out_fh);
- }
-
print $out_fh get_conditional_compile_line_end();
}