summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2022-01-06 18:37:07 -0600
committerSebastian Berg <sebastian@sipsolutions.net>2022-01-14 20:07:07 -0600
commit6bf1b2110157a58b5e909a169e7f495ce9e91b09 (patch)
tree3a29b3a739dc31828aae3907936eb599f1bf11d3 /numpy
parent0a636c4faf2826a13f25566668ee8649081b80d1 (diff)
downloadnumpy-6bf1b2110157a58b5e909a169e7f495ce9e91b09.tar.gz
BUG: Fix skiprows handling and simplify lineskipping logic
Somewhere, the skiprow handling got broken, this fixes it. It also simplifies the "forward to end of line" logic slightly (and hopefully correctly). This logic is used when a comment is reached or for skipping lines before any actual data is read.
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/src/multiarray/textreading/stream.h5
-rw-r--r--numpy/core/src/multiarray/textreading/tokenize.c.src16
2 files changed, 13 insertions, 8 deletions
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h
index 0c4567329..b2fb1e1bf 100644
--- a/numpy/core/src/multiarray/textreading/stream.h
+++ b/numpy/core/src/multiarray/textreading/stream.h
@@ -9,9 +9,8 @@
* we definitely expect to get line-by-line buffers.
*/
#define BUFFER_MAY_CONTAIN_NEWLINE 0
-#define BUFFER_IS_PARTIAL_LINE 1
-#define BUFFER_IS_LINEND 2
-#define BUFFER_IS_FILEEND 3
+#define BUFFER_IS_LINEND 1
+#define BUFFER_IS_FILEEND 2
typedef struct _stream {
void *stream_data;
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src
index ed68749d1..10475b921 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.c.src
+++ b/numpy/core/src/multiarray/textreading/tokenize.c.src
@@ -249,6 +249,8 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) {
pos = stop; /* advance to next buffer */
ts->state = TOKENIZE_LINE_END;
+ /* Ensure we don't think we have an empty line left to parse: */
+ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
break;
}
for (; pos < stop; pos++) {
@@ -322,16 +324,20 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
if (NPY_UNLIKELY(ts->pos >= ts->end)) {
if (ts->buf_state == BUFFER_IS_LINEND &&
- ts->state != TOKENIZE_QUOTED &&
- ts->state != TOKENIZE_CHECK_QUOTED) {
+ ts->state != TOKENIZE_QUOTED) {
/*
* Finished line, do not read anymore (also do not eat \n).
* If we are in a quoted field and the "line" does not end with
* a newline, the quoted field will be missing it right now.
- * TODO: We should probably just insert a "\n" character here,
- * which is also closer to what the python code did
- * (either by setting pos/end or manually).
+ * (i.e. `np.loadtxt(['"a', 'b"'], dtype="S2")` reads "ab")
+ * TODO: We should possibly insert a '\n' character when inside
+ * a quoted field and the '\n' character is not included
+ * in the string. `FileLike.readline()` does ensure it
+ * is included.
+ *
+ * Ensure we don't think we have an empty line left to parse:
*/
+ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
goto finish;
}
/* fetch new data */