diff options
author | Inaba Hiroto <inaba@st.rim.or.jp> | 2003-02-02 06:58:20 +0900 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2003-02-05 17:15:35 +0000 |
commit | 220e2d4e806a3c9c0a9f9b31667e8be830cbc55b (patch) | |
tree | 28905af26db33e5aba4ebee2d01aafaecdd67261 /toke.c | |
parent | 6efa47225d75a7e88a5d03044ee492ff6e1b7ee0 (diff) | |
download | perl-220e2d4e806a3c9c0a9f9b31667e8be830cbc55b.tar.gz |
[Patch] parsing under encoding (Re: [Encode] HEADS-UP; $Encode::VERSION++ to enhance filter option)([perl #16823])
Message-ID: <3E3BC46B.6C687CFD@st.rim.or.jp>
p4raw-id: //depot/perl@18660
Diffstat (limited to 'toke.c')
-rw-r--r-- | toke.c | 116 |
1 files changed, 102 insertions, 14 deletions
@@ -6882,6 +6882,10 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) register char *to; /* current position in the sv's data */ I32 brackets = 1; /* bracket nesting level */ bool has_utf8 = FALSE; /* is there any utf8 content? */ + I32 termcode; /* terminating char. code */ + U8 termstr[UTF8_MAXLEN]; /* terminating string */ + STRLEN termlen; /* length of terminating string */ + char *last = NULL; /* last position for nesting bracket */ /* skip space before the delimiter */ if (isSPACE(*s)) @@ -6892,8 +6896,16 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if (!UTF8_IS_INVARIANT((U8)term) && UTF) - has_utf8 = TRUE; + if (!UTF) { + termcode = termstr[0] = term; + termlen = 1; + } + else { + termcode = utf8_to_uvchr(s, &termlen); + Copy(s, termstr, termlen, U8); + if (!UTF8_IS_INVARIANT(term)) + has_utf8 = TRUE; + } /* mark where we are */ PL_multi_start = CopLINE(PL_curcop); @@ -6901,21 +6913,92 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* find corresponding closing delimiter */ if (term && (tmps = strchr("([{< )]}> )]}>",term))) - term = tmps[5]; + termcode = termstr[0] = term = tmps[5]; + PL_multi_close = term; /* create a new SV to hold the contents. 87 is leak category, I'm assuming. 79 is the SV's initial length. What a random number. */ sv = NEWSV(87,79); sv_upgrade(sv, SVt_PVIV); - SvIVX(sv) = term; + SvIVX(sv) = termcode; (void)SvPOK_only(sv); /* validate pointer */ /* move past delimiter and try to read a complete string */ if (keep_delims) - sv_catpvn(sv, s, 1); - s++; + sv_catpvn(sv, s, termlen); + s += termlen; for (;;) { + if (PL_encoding && !UTF) { + bool cont = TRUE; + + while (cont) { + int offset = s - SvPVX(PL_linestr); + bool found = sv_cat_decode(sv, PL_encoding, PL_linestr, + &offset, termstr, termlen); + char *ns = SvPVX(PL_linestr) + offset; + char *svlast = SvEND(sv) - 1; + + for (; s < ns; s++) { + if (*s == '\n' && !PL_rsfp) + CopLINE_inc(PL_curcop); + } + if (!found) + goto read_more_line; + else { + /* handle quoted delimiters */ + if (*(svlast-1) == '\\') { + char *t; + for (t = svlast-2; t >= SvPVX(sv) && *t == '\\';) + t--; + if ((svlast-1 - t) % 2) { + if (!keep_quoted) { + *(svlast-1) = term; + *svlast = '\0'; + SvCUR_set(sv, SvCUR(sv) - 1); + } + continue; + } + } + if (PL_multi_open == PL_multi_close) { + cont = FALSE; + } + else { + char *t, *w; + if (!last) + last = SvPVX(sv); + for (w = t = last; t < svlast; w++, t++) { + /* At here, all closes are "was quoted" one, + so we don't check PL_multi_close. */ + if (*t == '\\') { + if (!keep_quoted && *(t+1) == PL_multi_open) + t++; + else + *w++ = *t++; + } + else if (*t == PL_multi_open) + brackets++; + + *w = *t; + } + if (w < t) { + *w++ = term; + *w = '\0'; + SvCUR_set(sv, w - SvPVX(sv)); + } + last = w; + if (--brackets <= 0) + cont = FALSE; + } + } + } + if (!keep_delims) { + SvCUR_set(sv, SvCUR(sv) - 1); + *SvEND(sv) = '\0'; + } + break; + } + /* extend sv if need be */ SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1); /* set 'to' to the next character in the sv's string */ @@ -6937,8 +7020,12 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) } /* terminate when run out of buffer (the for() condition), or have found the terminator */ - else if (*s == term) - break; + else if (*s == term) { + if (termlen == 1) + break; + if (s+termlen <= PL_bufend && memEQ(s, termstr, termlen)) + break; + } else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; @@ -7000,6 +7087,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) to[-1] = '\n'; #endif + read_more_line: /* if we're out of file, or a read fails, bail and reset the current line marker so we can report where the unterminated string began */ @@ -7030,15 +7118,15 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* at this point, we have successfully read the delimited string */ - if (keep_delims) - sv_catpvn(sv, s, 1); - if (has_utf8) + if (!PL_encoding || UTF) { + if (keep_delims) + sv_catpvn(sv, s, termlen); + s += termlen; + } + if (has_utf8 || PL_encoding) SvUTF8_on(sv); - else if (PL_encoding) - sv_recode_to_utf8(sv, PL_encoding); PL_multi_end = CopLINE(PL_curcop); - s++; /* if we allocated too much space, give some back */ if (SvCUR(sv) + 5 < SvLEN(sv)) { |