summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2022-01-06 18:37:07 -0600
committerSebastian Berg <sebastian@sipsolutions.net>2022-01-14 20:07:07 -0600
commit6bf1b2110157a58b5e909a169e7f495ce9e91b09 (patch)
tree3a29b3a739dc31828aae3907936eb599f1bf11d3 /numpy
parent0a636c4faf2826a13f25566668ee8649081b80d1 (diff)
downloadnumpy-6bf1b2110157a58b5e909a169e7f495ce9e91b09.tar.gz
BUG: Fix skiprows handling and simplify lineskipping logic
Somewhere, the skiprow handling got broken, this fixes it. It also simplifies the "forward to end of line" logic slightly (and hopefully correctly). This logic is used when a comment is reached or for skipping lines before any actual data is read.
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/src/multiarray/textreading/stream.h5
-rw-r--r--numpy/core/src/multiarray/textreading/tokenize.c.src16
2 files changed, 13 insertions, 8 deletions
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h
index 0c4567329..b2fb1e1bf 100644
--- a/numpy/core/src/multiarray/textreading/stream.h
+++ b/numpy/core/src/multiarray/textreading/stream.h
@@ -9,9 +9,8 @@
* we definitely expect to get line-by-line buffers.
*/
#define BUFFER_MAY_CONTAIN_NEWLINE 0
-#define BUFFER_IS_PARTIAL_LINE 1
-#define BUFFER_IS_LINEND 2
-#define BUFFER_IS_FILEEND 3
+#define BUFFER_IS_LINEND 1
+#define BUFFER_IS_FILEEND 2
typedef struct _stream {
void *stream_data;
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src
index ed68749d1..10475b921 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.c.src
+++ b/numpy/core/src/multiarray/textreading/tokenize.c.src
@@ -249,6 +249,8 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) {
pos = stop; /* advance to next buffer */
ts->state = TOKENIZE_LINE_END;
+ /* Ensure we don't think we have an empty line left to parse: */
+ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
break;
}
for (; pos < stop; pos++) {
@@ -322,16 +324,20 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
if (NPY_UNLIKELY(ts->pos >= ts->end)) {
if (ts->buf_state == BUFFER_IS_LINEND &&
- ts->state != TOKENIZE_QUOTED &&
- ts->state != TOKENIZE_CHECK_QUOTED) {
+ ts->state != TOKENIZE_QUOTED) {
/*
* Finished line, do not read anymore (also do not eat \n).
* If we are in a quoted field and the "line" does not end with
* a newline, the quoted field will be missing it right now.
- * TODO: We should probably just insert a "\n" character here,
- * which is also closer to what the python code did
- * (either by setting pos/end or manually).
+ * (i.e. `np.loadtxt(['"a', 'b"'], dtype="S2")` reads "ab")
+ * TODO: We should possibly insert a '\n' character when inside
+ * a quoted field and the '\n' character is not included
+ * in the string. `FileLike.readline()` does ensure it
+ * is included.
+ *
+ * Ensure we don't think we have an empty line left to parse:
*/
+ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
goto finish;
}
/* fetch new data */