diff options
author | Karl Williamson <khw@cpan.org> | 2020-05-25 11:15:01 -0600 |
---|---|---|
committer | Sawyer X <xsawyerx@cpan.org> | 2020-05-27 11:09:32 +0300 |
commit | 65a97d491a9a6a59c242ed7027d1a284cf7c8d6d (patch) | |
tree | cf1b5c30349c8c0eec986a6ec7ebdf40de3d9bcd | |
parent | e2d0e9a5d1dd29d4005ca9a3e73222eeabd9e2b7 (diff) | |
download | perl-65a97d491a9a6a59c242ed7027d1a284cf7c8d6d.tar.gz |
Fix tr/// determination of inplace editing for EBCDIC
I realized as a result of fixing GH #17654, that the code didn't
properly decide if a tr/// can be done in-place on EBCDIC platforms.
Since we didn't have an EBCDIC smoker at the time, I couldn't be sure
that the fix actually worked. Now that we do have a smoker, I have
successfully tested it.
This patch is constructed so that the code generated on non-EBCDIC
platforms should not be changed by it.
-rw-r--r-- | ebcdic_tables.h | 110 | ||||
-rw-r--r-- | op.c | 58 | ||||
-rw-r--r-- | regen/ebcdic.pl | 50 |
3 files changed, 38 insertions, 180 deletions
diff --git a/ebcdic_tables.h b/ebcdic_tables.h index 9fdcbb6e76..cf1beeb85e 100644 --- a/ebcdic_tables.h +++ b/ebcdic_tables.h @@ -413,60 +413,6 @@ SOFTWARE. }; # endif -/* This table partitions all the code points of the platform into ranges which - * have the property that all the code points in each range have the same - * number of bytes in their UTF-EBCDIC representations, and the adjacent - * ranges have a different number of bytes. - * - * Each number in the table begins such a range, which extends up to just - * before the following table entry, except the final entry is understood to - * extend to the platform's infinity - */ -# ifndef DOINIT - EXTCONST UV PL_partition_by_byte_length[38]; -# else - EXTCONST UV PL_partition_by_byte_length[38] = { - 0x00, - 0x41, - 0x4b, - 0x51, - 0x5a, - 0x62, - 0x6b, - 0x70, - 0x79, - 0x80, - 0x81, - 0x8a, - 0x91, - 0x9a, - 0xa1, - 0xaa, - 0xad, - 0xae, - 0xbd, - 0xbe, - 0xc0, - 0xca, - 0xd0, - 0xda, - 0xe0, - 0xe1, - 0xe2, - 0xea, - 0xf0, - 0xfa, - 0xff, - 0x100, - 0x400, - 0x4000, - 0x40000, - 0x400000, - 0x4000000, - 0x40000000 -}; -# endif - #endif /* EBCDIC 1047 */ #if 'A' == 193 /* EBCDIC 037 */ \ @@ -845,62 +791,6 @@ SOFTWARE. }; # endif -/* This table partitions all the code points of the platform into ranges which - * have the property that all the code points in each range have the same - * number of bytes in their UTF-EBCDIC representations, and the adjacent - * ranges have a different number of bytes. - * - * Each number in the table begins such a range, which extends up to just - * before the following table entry, except the final entry is understood to - * extend to the platform's infinity - */ -# ifndef DOINIT - EXTCONST UV PL_partition_by_byte_length[40]; -# else - EXTCONST UV PL_partition_by_byte_length[40] = { - 0x00, - 0x41, - 0x4b, - 0x51, - 0x5a, - 0x5f, - 0x60, - 0x62, - 0x6b, - 0x70, - 0x79, - 0x80, - 0x81, - 0x8a, - 0x91, - 0x9a, - 0xa1, - 0xaa, - 0xb0, - 0xb1, - 0xba, - 0xbc, - 0xc0, - 0xca, - 0xd0, - 0xda, - 0xe0, - 0xe1, - 0xe2, - 0xea, - 0xf0, - 0xfa, - 0xff, - 0x100, - 0x400, - 0x4000, - 0x40000, - 0x400000, - 0x4000000, - 0x40000000 -}; -# endif - #endif /* EBCDIC 037 */ #endif /* PERL_EBCDIC_TABLES_H_ */ @@ -7061,12 +7061,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) * these up into smaller chunks, but doesn't merge any together. This * makes it easy to find the instances it's looking for. A second pass is * done after this has been determined which merges things together to - * shrink the table for runtime. For ASCII platforms, the table is - * trivial, given below, and uses the fundamental characteristics of UTF-8 - * to construct the values. For EBCDIC, it isn't so, and we rely on a - * table constructed by the perl script that generates these kinds of - * things */ -#ifndef EBCDIC + * shrink the table for runtime. The table below is used for both ASCII + * and EBCDIC platforms. On EBCDIC, the byte length is not monotonically + * increasing for code points below 256. To correct for that, the macro + * CP_ADJUST defined below converts those code points to ASCII in the first + * pass, and we use the ASCII partition values. That works because the + * growth factor will be unaffected, which is all that is calculated during + * the first pass. */ UV PL_partition_by_byte_length[] = { 0, 0x80, /* Below this is 1 byte representations */ @@ -7083,8 +7084,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) }; -#endif - PERL_ARGS_ASSERT_PMTRANS; PL_hints |= HINT_BLOCK_SCOPE; @@ -7212,6 +7211,21 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) t_array = invlist_array(t_invlist); } +/* As noted earlier, we convert EBCDIC code points to Unicode in the first pass + * so as to get the well-behaved length 1 vs length 2 boundary. Only code + * points below 256 differ between the two character sets in this regard. For + * these, we also can't have any ranges, as they have to be individually + * converted. */ +#ifdef EBCDIC +# define CP_ADJUST(x) ((pass2) ? (x) : NATIVE_TO_UNI(x)) +# define FORCE_RANGE_LEN_1(x) ((pass2) ? 0 : ((x) < 256)) +# define CP_SKIP(x) ((pass2) ? UVCHR_SKIP(x) : OFFUNISKIP(x)) +#else +# define CP_ADJUST(x) (x) +# define FORCE_RANGE_LEN_1(x) 0 +# define CP_SKIP(x) UVCHR_SKIP(x) +#endif + /* And the mapping of each of the ranges is initialized. Initially, * everything is TR_UNLISTED. */ for (i = 0; i < len; i++) { @@ -7345,7 +7359,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) /* Here, not in the middle of a range, and not UTF-8. The * next code point is the single byte where we're at */ - t_cp = *t; + t_cp = CP_ADJUST(*t); t_range_count = 1; t++; } @@ -7356,7 +7370,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) * next code point is the next UTF-8 char in the input. We * know the input is valid, because the toker constructed * it */ - t_cp = valid_utf8_to_uvchr(t, &t_char_len); + t_cp = CP_ADJUST(valid_utf8_to_uvchr(t, &t_char_len)); t += t_char_len; /* UTF-8 strings (only) have been parsed in toke.c to have @@ -7364,7 +7378,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) * the first element of a range. If so, get the final * element and calculate the range size. If not, the range * size is 1 */ - if (t < tend && *t == RANGE_INDICATOR) { + if ( t < tend && *t == RANGE_INDICATOR + && ! FORCE_RANGE_LEN_1(t_cp)) + { t++; t_range_count = valid_utf8_to_uvchr(t, &t_char_len) - t_cp + 1; @@ -7396,16 +7412,18 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } else { if (! rstr_utf8) { - r_cp = *r; + r_cp = CP_ADJUST(*r); r_range_count = 1; r++; } else { Size_t r_char_len; - r_cp = valid_utf8_to_uvchr(r, &r_char_len); + r_cp = CP_ADJUST(valid_utf8_to_uvchr(r, &r_char_len)); r += r_char_len; - if (r < rend && *r == RANGE_INDICATOR) { + if ( r < rend && *r == RANGE_INDICATOR + && ! FORCE_RANGE_LEN_1(r_cp)) + { r++; r_range_count = valid_utf8_to_uvchr(r, &r_char_len) - r_cp + 1; @@ -7537,7 +7555,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) * code point in the rhs against any code point in the lhs. */ if ( ! pass2 && r_cp_end != TR_SPECIAL_HANDLING - && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end)) + && CP_SKIP(t_cp_end) < CP_SKIP(r_cp_end)) { /* Here, we will need to make a copy of the input string * before doing the transliteration. The worst possible @@ -7560,8 +7578,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) * string not being UTF-8 */ NV t_size = (can_force_utf8 && t_cp < 256) ? 1 - : UVCHR_SKIP(t_cp_end); - NV ratio = UVCHR_SKIP(r_cp_end) / t_size; + : CP_SKIP(t_cp_end); + NV ratio = CP_SKIP(r_cp_end) / t_size; o->op_private |= OPpTRANS_GROWS; @@ -7594,8 +7612,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) * is if it 'grows'. But in the 2nd pass, there's no * reason to not merge */ if ( (i > 0 && ( pass2 - || UVCHR_SKIP(t_array[i-1]) - == UVCHR_SKIP(t_cp))) + || CP_SKIP(t_array[i-1]) + == CP_SKIP(t_cp))) && ( ( r_cp == TR_SPECIAL_HANDLING && r_map[i-1] == TR_SPECIAL_HANDLING) || ( r_cp != TR_SPECIAL_HANDLING @@ -7615,7 +7633,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) adjacent_to_range_above = TRUE; if (i + 1 < len) if ( ( pass2 - || UVCHR_SKIP(t_cp) == UVCHR_SKIP(t_array[i+1])) + || CP_SKIP(t_cp) == CP_SKIP(t_array[i+1])) && ( ( r_cp == TR_SPECIAL_HANDLING && r_map[i+1] == (UV) TR_SPECIAL_HANDLING) || ( r_cp != TR_SPECIAL_HANDLING diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl index cacf732b0c..863e9b9adc 100644 --- a/regen/ebcdic.pl +++ b/regen/ebcdic.pl @@ -779,56 +779,6 @@ END output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES); } - { - print $out_fh <<EOF; -/* This table partitions all the code points of the platform into ranges which - * have the property that all the code points in each range have the same - * number of bytes in their UTF-EBCDIC representations, and the adjacent - * ranges have a different number of bytes. - * - * Each number in the table begins such a range, which extends up to just - * before the following table entry, except the final entry is understood to - * extend to the platform's infinity - */ -EOF - # The lengths of the characters between 0 and 255 are either 1 or 2, - # with those whose ASCII platform equivalents below 160 being 1, and - # the rest being 2. - my @list; - push @list, 0; - my $pushed_range_is_length_1 = 1; - - for my $i (1 .. 0xFF) { - my $this_code_point_is_length_1 = ($e2a[$i] < 160); - if ($pushed_range_is_length_1 != $this_code_point_is_length_1) { - push @list, $i; - $pushed_range_is_length_1 = $this_code_point_is_length_1; - } - } - - # Starting at 256, the length is 2. - push @list, 0x100 if $pushed_range_is_length_1; - - # These are based on the fundamental properties of UTF-EBCDIC. Each - # continuation byte has 5 bits of information. Comments in utf8.h - # explain the rest. - my $UTF_ACCUMULATION_SHIFT = 5; - push @list, (32 * (1 << ( $UTF_ACCUMULATION_SHIFT))); - push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT))); - push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT))); - push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT))); - push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT))); - push @list, ( (1 << (6 * $UTF_ACCUMULATION_SHIFT))); - - output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list); - print $out_fh "\t"; - - print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list; - print $out_fh "\n"; - - output_table_end($out_fh); - } - print $out_fh get_conditional_compile_line_end(); } |