summaryrefslogtreecommitdiff
path: root/toke.c
diff options
context:
space:
mode:
Diffstat (limited to 'toke.c')
-rw-r--r--toke.c116
1 files changed, 86 insertions, 30 deletions
diff --git a/toke.c b/toke.c
index b54b368558..3f13f76a8b 100644
--- a/toke.c
+++ b/toke.c
@@ -2994,11 +2994,12 @@ S_scan_const(pTHX_ char *start)
* (the range's maximum end point) before 'd' begins.
*/
char * max_ptr = SvPVX(sv) + offset_to_max;
- const char * min_ptr;
+ char * min_ptr;
IV range_min;
IV range_max; /* last character in range */
- STRLEN save_offset;
STRLEN grow;
+ Size_t offset_to_min = 0;
+ Size_t extras = 0;
#ifdef EBCDIC
bool convert_unicode;
IV real_range_max = 0;
@@ -3065,7 +3066,6 @@ S_scan_const(pTHX_ char *start)
range_max = UNI_TO_NATIVE(range_max);
}
#endif
-
/* Use the characters themselves for the error message if
* ASCII printables; otherwise some visible representation
* of them */
@@ -3140,43 +3140,85 @@ S_scan_const(pTHX_ char *start)
}
/* Here we need to expand out the string to contain each
- * character in the range. Grow the output to handle this */
+ * character in the range. Grow the output to handle this.
+ * For non-UTF8, we need a byte for each code point in the
+ * range, minus the three that we've already allocated for: the
+ * hyphen, the min, and the max. For UTF-8, we need this
+ * plus an extra byte for each code point that occupies two
+ * bytes (is variant) when in UTF-8 (except we've already
+ * allocated for the end points, including if they are
+ * variants). For ASCII platforms and Unicode ranges on EBCDIC
+ * platforms, it's easy to calculate a precise number. To
+ * start, we count the variants in the range, which we need
+ * elsewhere in this function anyway. (For the case where it
+ * isn't easy to calculate, 'extras' has been initialized to 0,
+ * and the calculation is done in a loop further down.) */
+#ifdef EBCDIC
+ if (convert_unicode)
+#endif
+ {
+ /* This is executed unconditionally on ASCII, and for
+ * Unicode ranges on EBCDIC. Under these conditions, all
+ * code points above a certain value are variant; and none
+ * under that value are. We just need to find out how much
+ * of the range is above that value. We don't count the
+ * end points here, as they will already have been counted
+ * as they were parsed. */
+ if (range_min >= UTF_CONTINUATION_MARK) {
+
+ /* The whole range is made up of variants */
+ extras = (range_max - 1) - (range_min + 1) + 1;
+ }
+ else if (range_max >= UTF_CONTINUATION_MARK) {
- save_offset = min_ptr - SvPVX_const(sv);
+ /* Only the higher portion of the range is variants */
+ extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
+ }
- /* The base growth is the number of code points in the range */
- grow = range_max - range_min + 1;
- if (has_utf8) {
+ utf8_variant_count += extras;
+ }
+
+ /* The base growth is the number of code points in the range,
+ * not including the endpoints, which have already been sized
+ * for (and output). We don't subtract for the hyphen, as it
+ * has been parsed but not output, and the SvGROW below is
+ * based only on what's been output plus what's left to parse.
+ * */
+ grow = (range_max - 1) - (range_min + 1) + 1;
- /* But if the output is UTF-8, some of those characters may
- * need two bytes (since the maximum range value here is
- * 255, the max bytes per character is two). On ASCII
- * platforms, it's not much trouble to get an accurate
- * count of what's needed. But on EBCDIC, the ones that
- * need 2 bytes are scattered around, so just use a worst
- * case value instead of calculating for that platform. */
+ if (has_utf8) {
#ifdef EBCDIC
- grow *= 2;
-#else
- /* Only those above 127 require 2 bytes. This may be
- * everything in the range, or not */
- if (range_min > 127) {
+ /* In some cases in EBCDIC, we haven't yet calculated a
+ * precise amount needed for the UTF-8 variants. Just
+ * assume the worst case, that everything will expand by a
+ * byte */
+ if (! convert_unicode) {
grow *= 2;
}
- else if (range_max > 127) {
- grow += range_max - 127;
- }
+ else
#endif
+ {
+ /* Otherwise we know exactly how many variants there
+ * are in the range. */
+ grow += extras;
+ }
}
- /* Subtract 3 for the bytes that were already accounted for
- * (min, max, and the hyphen) */
- d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3);
+ /* Grow, but position the output to overwrite the range min end
+ * point, because in some cases we overwrite that */
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ offset_to_min = min_ptr - SvPVX_const(sv);
+
+ /* See Note on sizing above. */
+ d = offset_to_min + SvGROW(sv, SvCUR(sv)
+ + (send - s)
+ + grow
+ + 1 /* Trailing NUL */ );
+ /* Now, we can expand out the range. */
#ifdef EBCDIC
- /* Here, we expand out the range. */
if (convert_unicode) {
- IV i;
+ SSize_t i;
/* Recall that the min and max are now in Unicode terms, so
* we have to convert each character to its native
@@ -3198,7 +3240,7 @@ S_scan_const(pTHX_ char *start)
#endif
/* Always gets run for ASCII, and sometimes for EBCDIC. */
{
- IV i;
+ SSize_t i;
/* Here, no conversions are necessary, which means that the
* first character in the range is already in 'd' and
@@ -3211,9 +3253,23 @@ S_scan_const(pTHX_ char *start)
}
else {
d++;
- for (i = range_min + 1; i <= range_max; i++) {
+ assert(range_min + 1 <= range_max);
+ for (i = range_min + 1; i < range_max; i++) {
+#ifdef EBCDIC
+ /* In this case on EBCDIC, we haven't calculated
+ * the variants. Do it here, as we go along */
+ if (! UVCHR_IS_INVARIANT(i)) {
+ utf8_variant_count++;
+ }
+#endif
*d++ = (char)i;
}
+
+ /* The range_max is done outside the loop so as to
+ * avoid having to special case not incrementing
+ * 'utf8_variant_count' on EBCDIC (it's already been
+ * counted when originally parsed) */
+ *d++ = (char) range_max;
}
}