summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author dmbelov <dmbelov@gmail.com> 2023-01-01 07:38:57 -0500
committer GitHub <noreply@github.com> 2023-01-01 13:38:57 +0100
commit 6d474f2dfe5e4b7ea2a6e7423a638316fa186227 (patch)
tree f7b19ec9919f95433e6ee41ebb596d0df28ecd4c
parent 77f15776e1d6d1357cd8c45dd297da821e18e0c2 (diff)
download numpy-6d474f2dfe5e4b7ea2a6e7423a638316fa186227.tar.gz
BUG: np.loadtxt cannot load text file with quoted fields separated by whitespace (#22906)
Fix issue with `delimiter=None` and quote character not working properly (not using whitespace delimiter mode). Closes gh-22899
-rw-r--r-- numpy/core/src/multiarray/textreading/tokenize.cpp 3
-rw-r--r-- numpy/lib/npyio.py 8
-rw-r--r-- numpy/lib/tests/test_loadtxt.py 14
3 files changed, 24 insertions, 1 deletions
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index ff7e7a8c1..cc5e621d6 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -223,7 +223,8 @@ tokenizer_core(tokenizer_state *ts, parser_config *const config)
}
else {
/* continue parsing as if unquoted */
- ts->state = TOKENIZE_UNQUOTED;
+ /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+ ts->state = ts->unquoted_state;
}
break;
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 4a27c7898..71d600c30 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -1303,6 +1303,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
array([('alpha, #42', 10.), ('beta, #64', 2.)],
dtype=[('label', '<U12'), ('value', '<f8')])
+ Quoted fields can be separated by multiple whitespace characters:
+
+ >>> s = StringIO('"alpha, #42" 10.0\n"beta, #64" 2.0\n')
+ >>> dtype = np.dtype([("label", "U12"), ("value", float)])
+ >>> np.loadtxt(s, dtype=dtype, delimiter=None, quotechar='"')
+ array([('alpha, #42', 10.), ('beta, #64', 2.)],
+ dtype=[('label', '<U12'), ('value', '<f8')])
+
Two consecutive quote characters within a quoted field are treated as a
single escaped character:
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 819a8dda4..8a5b044b8 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -534,6 +534,20 @@ def test_quoted_field(q):
assert_array_equal(res, expected)
+@pytest.mark.parametrize("q", ('"', "'", "`"))
+def test_quoted_field_with_whitepace_delimiter(q):
+ txt = StringIO(
+ f"{q}alpha, x{q} 2.5\n{q}beta, y{q} 4.5\n{q}gamma, z{q} 5.0\n"
+ )
+ dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)])
+ expected = np.array(
+ [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype
+ )
+
+ res = np.loadtxt(txt, dtype=dtype, delimiter=None, quotechar=q)
+ assert_array_equal(res, expected)
+
+
def test_quote_support_default():
"""Support for quoted fields is disabled by default."""
txt = StringIO('"lat,long", 45, 30\n')