summaryrefslogtreecommitdiff
path: root/sv.c
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2014-05-12 00:35:19 +0200
committerYves Orton <demerphq@gmail.com>2014-05-12 08:51:10 +0200
commit75be6ec825e310191f10a860f1dfbb74b835c2fa (patch)
tree89248b29d359cade2b59fa87e46031000167e04d /sv.c
parente8c6a474e88610b73e62a19256dc8706b42f42b9 (diff)
downloadperl-75be6ec825e310191f10a860f1dfbb74b835c2fa.tar.gz
document sv_gets() internals
sv_gets() contains some very dense code. while trying to rewrite I learned a lot about it worked, and Nicholas asked me to update the comments. So here they are. Hope this saves someone some heartache trying to understand this code.
Diffstat (limited to 'sv.c')
-rw-r--r--sv.c126
1 files changed, 118 insertions, 8 deletions
diff --git a/sv.c b/sv.c
index 93fcdac169..66d3bf6a3e 100644
--- a/sv.c
+++ b/sv.c
@@ -8058,6 +8058,7 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
SvUPGRADE(sv, SVt_PV);
if (append) {
+ /* line is going to be appended to the existing buffer in the sv */
if (PerlIO_isutf8(fp)) {
if (!SvUTF8(sv)) {
sv_utf8_upgrade_nomg(sv);
@@ -8070,6 +8071,8 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
SvPOK_only(sv);
if (!append) {
+ /* not appending - "clear" the string by setting SvCUR to 0,
+ * the pv is still avaiable. */
SvCUR_set(sv,0);
}
if (PerlIO_isutf8(fp))
@@ -8121,10 +8124,14 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
Perl_croak(aTHX_ "Wide character in $/");
}
}
+ /* extract the raw pointer to the record separator */
rsptr = SvPV_const(PL_rs, rslen);
}
}
+ /* rslast is the last character in the record separator
+ * note we don't use rslast except when rslen is true, so the
+ * null assign is a placeholder. */
rslast = rslen ? rsptr[rslen - 1] : '\0';
if (rspara) { /* have to do this both before and after */
@@ -8150,14 +8157,28 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
*/
if (PerlIO_fast_gets(fp)) {
+ /*
+ * We can do buffer based IO operations on this filehandle.
+ *
+ * This means we can bypass a lot of subcalls and process
+ * the buffer directly, it also means we know the upper bound
+ * on the amount of data we might read of the current buffer
+ * into our sv. Knowing this allows us to preallocate the pv
+ * to be able to hold that maximum, which allows us to simplify
+ * a lot of logic. /*
/*
* We're going to steal some values from the stdio struct
* and put EVERYTHING in the innermost loop into registers.
*/
- STDCHAR *ptr;
- STRLEN bpx;
- I32 shortbuffered;
+ STDCHAR *ptr; /* pointer into fp's read-ahead buffer */
+ STRLEN bpx; /* length of the data in the target sv
+ used to fix pointers after a SvGROW */
+ I32 shortbuffered; /* If the pv buffer is shorter than the amount
+ of data left in the read-ahead buffer.
+ If 0 then the pv buffer can hold the full
+ amount left, otherwise this is the amount it
+ can hold. */
#if defined(VMS) && defined(PERLIO_IS_STDIO)
/* An ungetc()d char is handled separately from the regular
@@ -8171,7 +8192,64 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
/* Here is some breathtakingly efficient cheating */
- cnt = PerlIO_get_cnt(fp); /* get count into register */
+ /* When you read the following logic resist the urge to think
+ * of record separators that are 1 byte long. They are an
+ * uninteresting special (simple) case.
+ *
+ * Instead think of record separators which are at least 2 bytes
+ * long, and keep in mind that we need to deal with such
+ * separators when they cross a read-ahead buffer boundary.
+ *
+ * Also consider that we need to gracefully deal with separators
+ * that may be longer than a single read ahead buffer.
+ *
+ * Lastly do not forget we want to copy the delimiter as well. We
+ * are copying all data in the file _up_to_and_including_ the separator
+ * itself.
+ *
+ * Now that you have all that in mind here is what is happening below:
+ *
+ * 1. When we first enter the loop we do some memory book keeping to see
+ * how much free space there is in the target SV. (This sub assumes that
+ * it is operating on the same SV most of the time via $_ and that it is
+ * going to be able to reuse the same pv buffer each call.) If there is
+ * "enough" room then we set "shortbuffered" to how much space there is
+ * and start reading forward.
+ *
+ * 2. When we scan forward we copy from the read-ahead buffer to the target
+ * SV's pv buffer. While we go we watch for the end of the read-ahead buffer,
+ * and the end of the of pv, as well as for the "rslast", which is the last
+ * char of the separator.
+ *
+ * 3. When scanning forward if we see rslast then we jump backwards in *pv*
+ * (which has a "complete" record up to the point we saw rslast) and check
+ * it to see if it matches the separator. If it does we are done. If it doesn't
+ * we continue on with the scan/copy.
+ *
+ * 4. If we run out of read-ahead buffer (cnt goes to 0) then we have to get
+ * the IO system to read the next buffer. We do this by doing a getc(), which
+ * returns a single char read (or EOF), and prefills the buffer, and also
+ * allows us to find out how full the buffer is. We use this information to
+ * SvGROW() the sv to the size remaining in the buffer, after which we copy
+ * the returned single char into the target sv, and then go back into scan
+ * forward mode.
+ *
+ * 5. If we run out of write-buffer then we SvGROW() it by the size of the
+ * remaining space in the read-buffer.
+ *
+ * Note that this code despite its twisty-turny nature is pretty darn slick.
+ * It manages single byte separators, multi-byte cross boundary separators,
+ * and cross-read-buffer separators cleanly and efficiently at the cost
+ * of potentially greatly overallocating the target SV.
+ *
+ * Yves
+ */
+
+
+ /* get the number of bytes remaining in the read-ahead buffer
+ * on first call on a given fp this will return 0.*/
+ cnt = PerlIO_get_cnt(fp);
+
/* make sure we have the room */
if ((I32)(SvLEN(sv) - append) <= cnt + 1) {
/* Not room for all of it
@@ -8183,15 +8261,24 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
cnt -= shortbuffered;
}
else {
+ /* ensure that the target sv has enough room to hold
+ * the rest of the read-ahead buffer */
shortbuffered = 0;
/* remember that cnt can be negative */
SvGROW(sv, (STRLEN)(append + (cnt <= 0 ? 2 : (cnt + 1))));
}
}
- else
+ else {
+ /* we have enough room to hold the full buffer, lets scream */
shortbuffered = 0;
+ }
+
+ /* extract the pointer to sv's string buffer, offset by append as necessary */
bp = (STDCHAR*)SvPVX_const(sv) + append; /* move these two too to registers */
+ /* extract the point to the read-ahead buffer */
ptr = (STDCHAR*)PerlIO_get_ptr(fp);
+
+ /* some trace debug output */
DEBUG_P(PerlIO_printf(Perl_debug_log,
"Screamer: entering, ptr=%"UVuf", cnt=%ld\n",PTR2UV(ptr),(long)cnt));
DEBUG_P(PerlIO_printf(Perl_debug_log,
@@ -8199,17 +8286,23 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
UVuf"\n",
PTR2UV(PerlIO_get_ptr(fp)), PerlIO_get_cnt(fp),
PTR2UV(PerlIO_has_base(fp) ? PerlIO_get_base(fp) : 0)));
+
for (;;) {
screamer:
+ /* if there is stuff left in the read-ahead buffer */
if (cnt > 0) {
+ /* if there is a separator */
if (rslen) {
+ /* loop until we hit the end of the read-ahead buffer */
while (cnt > 0) { /* this | eat */
+ /* scan forward copying and searching for rslast as we go */
cnt--;
if ((*bp++ = *ptr++) == rslast) /* really | dust */
goto thats_all_folks; /* screams | sed :-) */
}
}
else {
+ /* no separator, slurp the full buffer */
Copy(ptr, bp, cnt, char); /* this | eat */
bp += cnt; /* screams | dust */
ptr += cnt; /* louder | sed :-) */
@@ -8220,16 +8313,21 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
}
if (shortbuffered) { /* oh well, must extend */
+ /* we didnt have enough room to fit the line into the target buffer
+ * so we must extend the target buffer and keep going */
cnt = shortbuffered;
shortbuffered = 0;
bpx = bp - (STDCHAR*)SvPVX_const(sv); /* box up before relocation */
SvCUR_set(sv, bpx);
+ /* extned the target sv's buffer so it can hold the full read-ahead buffer */
SvGROW(sv, SvLEN(sv) + append + cnt + 2);
bp = (STDCHAR*)SvPVX_const(sv) + bpx; /* unbox after relocation */
continue;
}
cannot_be_shortbuffered:
+ /* we need to refill the read-ahead buffer if possible */
+
DEBUG_P(PerlIO_printf(Perl_debug_log,
"Screamer: going to getc, ptr=%"UVuf", cnt=%zd\n",
PTR2UV(ptr),cnt));
@@ -8240,9 +8338,15 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
PTR2UV(PerlIO_get_ptr(fp)), PerlIO_get_cnt(fp),
PTR2UV(PerlIO_has_base (fp) ? PerlIO_get_base(fp) : 0)));
- /* This used to call 'filbuf' in stdio form, but as that behaves like
- getc when cnt <= 0 we use PerlIO_getc here to avoid introducing
- another abstraction. */
+ /*
+ call PerlIO_getc() to let it prefill the lookahead buffer
+
+ This used to call 'filbuf' in stdio form, but as that behaves like
+ getc when cnt <= 0 we use PerlIO_getc here to avoid introducing
+ another abstraction.
+
+ Note we have to deal with the char in 'i' if we are not at EOF
+ */
i = PerlIO_getc(fp); /* get more characters */
DEBUG_Pv(PerlIO_printf(Perl_debug_log,
@@ -8250,6 +8354,7 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
PTR2UV(PerlIO_get_ptr(fp)), PerlIO_get_cnt(fp),
PTR2UV(PerlIO_has_base (fp) ? PerlIO_get_base(fp) : 0)));
+ /* find out how much is left in the read-ahead buffer, and rextract its pointer */
cnt = PerlIO_get_cnt(fp);
ptr = (STDCHAR*)PerlIO_get_ptr(fp); /* reregisterize cnt and ptr */
DEBUG_P(PerlIO_printf(Perl_debug_log,
@@ -8259,18 +8364,23 @@ Perl_sv_gets(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
if (i == EOF) /* all done for ever? */
goto thats_really_all_folks;
+ /* make sure we have enough space in the target sv */
bpx = bp - (STDCHAR*)SvPVX_const(sv); /* box up before relocation */
SvCUR_set(sv, bpx);
SvGROW(sv, bpx + cnt + 2);
bp = (STDCHAR*)SvPVX_const(sv) + bpx; /* unbox after relocation */
+ /* copy of the char we got from getc() */
*bp++ = (STDCHAR)i; /* store character from PerlIO_getc */
+ /* make sure we deal with the i being the last character of a separator */
if (rslen && (STDCHAR)i == rslast) /* all done for now? */
goto thats_all_folks;
}
thats_all_folks:
+ /* check if we have actually found the separator - only really applies
+ * when rslen > 1 */
if ((rslen > 1 && (STRLEN)(bp - (STDCHAR*)SvPVX_const(sv)) < rslen) ||
memNE((char*)bp - rslen, rsptr, rslen))
goto screamer; /* go back to the fray */