diff options
author | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-06 18:37:07 -0600 |
---|---|---|
committer | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-14 20:07:07 -0600 |
commit | 6bf1b2110157a58b5e909a169e7f495ce9e91b09 (patch) | |
tree | 3a29b3a739dc31828aae3907936eb599f1bf11d3 /numpy | |
parent | 0a636c4faf2826a13f25566668ee8649081b80d1 (diff) | |
download | numpy-6bf1b2110157a58b5e909a169e7f495ce9e91b09.tar.gz |
BUG: Fix skiprows handling and simplify lineskipping logic
Somewhere, the skiprows handling got broken; this fixes it. It also
simplifies the "forward to end of line" logic slightly (and hopefully
correctly). This logic is used when a comment is reached or for
skipping lines before any actual data is read.
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/multiarray/textreading/stream.h | 5 | ||||
-rw-r--r-- | numpy/core/src/multiarray/textreading/tokenize.c.src | 16 |
2 files changed, 13 insertions, 8 deletions
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h index 0c4567329..b2fb1e1bf 100644 --- a/numpy/core/src/multiarray/textreading/stream.h +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -9,9 +9,8 @@ * we definitely expect to get line-by-line buffers. */ #define BUFFER_MAY_CONTAIN_NEWLINE 0 -#define BUFFER_IS_PARTIAL_LINE 1 -#define BUFFER_IS_LINEND 2 -#define BUFFER_IS_FILEEND 3 +#define BUFFER_IS_LINEND 1 +#define BUFFER_IS_FILEEND 2 typedef struct _stream { void *stream_data; diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index ed68749d1..10475b921 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -249,6 +249,8 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) { pos = stop; /* advance to next buffer */ ts->state = TOKENIZE_LINE_END; + /* Ensure we don't think we have an empty line left to parse: */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; break; } for (; pos < stop; pos++) { @@ -322,16 +324,20 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) if (NPY_UNLIKELY(ts->pos >= ts->end)) { if (ts->buf_state == BUFFER_IS_LINEND && - ts->state != TOKENIZE_QUOTED && - ts->state != TOKENIZE_CHECK_QUOTED) { + ts->state != TOKENIZE_QUOTED) { /* * Finished line, do not read anymore (also do not eat \n). * If we are in a quoted field and the "line" does not end with * a newline, the quoted field will be missing it right now. - * TODO: We should probably just insert a "\n" character here, - * which is also closer to what the python code did - * (either by setting pos/end or manually). + * (i.e. `np.loadtxt(['"a', 'b"'], dtype="S2")` reads "ab") + * TODO: We should possibly insert a '\n' character when inside + * a quoted field and the '\n' character is not included + * in the string. `FileLike.readline()` does ensure it + * is included. + * + * Ensure we don't think we have an empty line left to parse: */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; goto finish; } /* fetch new data */ |