summary | refs | log | tree | commit | diff
path: root/toke.c
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2022-02-12 21:47:13 -0700
committer: Karl Williamson <khw@cpan.org> 2022-03-19 23:17:51 -0600
commit: 32b87797e986f5d99836e16ea6b9d9ff5a56d3be (patch)
tree: e8a1ebb243554bf3901d683c0c9271c4ebcdf1e7 /toke.c
parent: 908ddb65a3d72cfc75ac23c08a1db5e52e22e688 (diff)
download: perl-32b87797e986f5d99836e16ea6b9d9ff5a56d3be.tar.gz
toke.c: merge loops, multi-byte delim
The code in toke.c had two closely related loops: one for unmirrored delimiters (the same on both ends of the string), which could take UTF-8 delimiters, and one that allowed a mirrored closing delimiter, which could take only a single byte. I found it easiest to collapse these into one loop, in preparation for allowing multi-byte mirroring.
Diffstat (limited to 'toke.c')
-rw-r--r-- toke.c 146
1 files changed, 75 insertions, 71 deletions
diff --git a/toke.c b/toke.c
index 5fb5acac0b..8f12cf9a6c 100644
--- a/toke.c
+++ b/toke.c
@@ -11324,7 +11324,8 @@ Perl_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int
bool d_is_utf8 = FALSE; /* is there any utf8 content? */
UV open_delim_code; /* code point */
UV close_delim_code; /* code point */
- U8 close_delim_str[UTF8_MAXBYTES+1];
+ char open_delim_str[UTF8_MAXBYTES+1];
+ char close_delim_str[UTF8_MAXBYTES+1];
char close_delim_byte0;
STRLEN delim_byte_len; /* each delimiter currently is the same number
of bytes */
@@ -11352,21 +11353,26 @@ Perl_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int
/* after skipping whitespace, the next character is the delimiter */
close_delim_byte0 = *s;
if (!UTF || UTF8_IS_INVARIANT(close_delim_byte0)) {
- open_delim_code = close_delim_code = close_delim_str[0] = close_delim_byte0;
+ close_delim_str[0] = close_delim_byte0;
+ open_delim_str[0] = close_delim_str[0];
+
+ close_delim_code = (U8) close_delim_str[0];
+ open_delim_code = close_delim_code;
delim_byte_len = 1;
}
else {
open_delim_code = close_delim_code =
utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &delim_byte_len);
if (UTF && UNLIKELY(! is_grapheme((U8 *) start,
- (U8 *) s,
- (U8 *) PL_bufend,
- open_delim_code)))
+ (U8 *) s,
+ (U8 *) PL_bufend,
+ open_delim_code)))
{
yyerror(non_grapheme_msg);
}
- Copy(s, close_delim_str, delim_byte_len, U8);
+ Copy(s, open_delim_str, delim_byte_len, char);
+ Copy(s, close_delim_str, delim_byte_len, char);
}
/* mark where we are */
@@ -11376,7 +11382,8 @@ Perl_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int
/* If the delimiter has a mirror-image closing one, get it */
if (close_delim_byte0 && (tmps = strchr(opening_delims, close_delim_byte0))) {
- close_delim_code = close_delim_str[0] = close_delim_byte0 = closing_delims[tmps - opening_delims];
+ close_delim_str[0] = close_delim_byte0 = closing_delims[tmps - opening_delims];
+ close_delim_code = (U8) close_delim_str[0];
}
PL_multi_close = close_delim_code;
@@ -11402,83 +11409,80 @@ Perl_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int
/* set 'to' to the next character in the sv's string */
to = SvPVX(sv)+SvCUR(sv);
- /* if open delimiter is the close delimiter read unbridle */
- if (PL_multi_open == PL_multi_close) {
- for (; s < PL_bufend; s++,to++) {
- /* embedded newlines increment the current line number */
- if (*s == '\n' && !PL_rsfp && !PL_parser->filtered)
- COPLINE_INC_WITH_HERELINES;
- /* handle quoted delimiters */
- if (*s == '\\' && s+1 < PL_bufend && close_delim_byte0 != '\\') {
- if (!keep_bracketed_quoted
- && (s[1] == close_delim_byte0
- || (re_reparse && s[1] == '\\'))
- )
- s++;
- else /* any other quotes are simply copied straight through */
- *to++ = *s++;
- }
- /* terminate when run out of buffer (the for() condition), or
- have found the closing delimiter */
- else if (*s == close_delim_byte0) { /* First byte matches */
- if (delim_byte_len == 1) /* If is the only byte, are done */
- break;
-
- /* If the remainder of the closing delimiter matches, also
- * are done, after checking that is a separate grapheme */
- if ( s + delim_byte_len <= PL_bufend
- && memEQ(s + 1, (char*)close_delim_str + 1, delim_byte_len - 1))
- {
- if ( UTF
- && UNLIKELY(! is_grapheme((U8 *) start,
- (U8 *) s,
- (U8 *) PL_bufend,
- close_delim_code)))
- {
- yyerror(non_grapheme_msg);
- }
- break;
- }
- }
- else if (! UTF8_IS_INVARIANT((U8)*s) && UTF) {
- d_is_utf8 = TRUE;
- }
-
- *to = *s;
- }
- }
-
- /* if the closing delimiter isn't the same as the start character (e.g.,
- matched brackets), we have to allow more in the quoting, and
- be prepared for nested brackets.
- */
- else {
/* read until we run out of string, or we find the closing delimiter */
- for (; s < PL_bufend; s++,to++) {
+ while (s < PL_bufend) {
/* embedded newlines increment the line count */
if (*s == '\n' && !PL_rsfp && !PL_parser->filtered)
COPLINE_INC_WITH_HERELINES;
- /* backslashes can escape the open or closing characters */
- if (*s == '\\' && s+1 < PL_bufend) {
- if (!keep_bracketed_quoted
- && ( ((UV)s[1] == PL_multi_open)
- || ((UV)s[1] == PL_multi_close) ))
+
+ /* backslashes can escape the closing delimiter */
+ if ( *s == '\\' && s < PL_bufend - delim_byte_len
+
+ /* ... but not if the delimiter itself is a backslash */
+ && close_delim_byte0 != '\\')
+ {
+ /* Here, we have an escaping backslash. If we're supposed to
+ * discard those that escape the closing delimiter, just
+ * discard this one */
+ if ( ! keep_bracketed_quoted
+ && ( memEQ(s + 1, open_delim_str, delim_byte_len)
+ || ( PL_multi_open == PL_multi_close
+ && re_reparse && s[1] == '\\')
+ || memEQ(s + 1, close_delim_str, delim_byte_len)))
{
s++;
}
- else
+ else /* any other escapes are simply copied straight through */
*to++ = *s++;
}
- /* allow nested opens and closes */
- else if (*(U8 *) s == PL_multi_close && --brackets <= 0)
+ else if ( s < PL_bufend - (delim_byte_len - 1)
+ && memEQ(s, close_delim_str, delim_byte_len)
+ && --brackets <= 0)
+ {
+ /* Found unescaped closing delimiter, unnested if we care about
+ * that; so are done.
+ *
+ * In the case of the opening and closing delimiters being
+ * different, we have to deal with nesting; the conditional
+ * above makes sure we don't get here until the nesting level,
+ * 'brackets', is back down to zero. In the other case,
+ * nesting isn't an issue, and 'brackets' never can get
+ * incremented above 0, so will come here at the first closing
+ * delimiter.
+ *
+ * Only grapheme delimiters are legal. */
+ if ( UTF /* All Non-UTF-8's are graphemes */
+ && UNLIKELY(! is_grapheme((U8 *) start,
+ (U8 *) s,
+ (U8 *) PL_bufend,
+ close_delim_code)))
+ {
+ yyerror(non_grapheme_msg);
+ }
+
break;
- else if (*(U8 *) s == PL_multi_open)
+ }
+ /* No nesting if open eq close */
+ else if ( PL_multi_open != PL_multi_close
+ && s < PL_bufend - (delim_byte_len - 1)
+ && memEQ(s, open_delim_str, delim_byte_len))
+ {
brackets++;
- else if (! UTF8_IS_INVARIANT((U8)*s) && UTF)
+ }
+
+ if (UTF && ! UTF8_IS_INVARIANT((U8) *s)) {
+ size_t this_char_len = UTF8SKIP(s);
+ Copy(s, to, this_char_len, char);
+ s += this_char_len;
+ to += this_char_len;
+
d_is_utf8 = TRUE;
- *to = *s;
- }
+ }
+ else {
+ *to++ = *s++;
+ }
}
+
/* terminate the copied string and update the sv's end-of-string */
*to = '\0';
SvCUR_set(sv, to - SvPVX_const(sv));