-rw-r--r--  numpy/core/setup.py                                      |    9
-rw-r--r--  numpy/core/src/multiarray/conversion_utils.c             |   11
-rw-r--r--  numpy/core/src/multiarray/conversion_utils.h             |    3
-rw-r--r--  numpy/core/src/multiarray/multiarraymodule.c             |    3
-rw-r--r--  numpy/core/src/multiarray/textreading/conversions.c      |  375
-rw-r--r--  numpy/core/src/multiarray/textreading/conversions.h      |   57
-rw-r--r--  numpy/core/src/multiarray/textreading/field_types.c      |  200
-rw-r--r--  numpy/core/src/multiarray/textreading/field_types.h      |   49
-rw-r--r--  numpy/core/src/multiarray/textreading/growth.c           |   38
-rw-r--r--  numpy/core/src/multiarray/textreading/growth.h           |    7
-rw-r--r--  numpy/core/src/multiarray/textreading/parser_config.h    |   77
-rw-r--r--  numpy/core/src/multiarray/textreading/readtext.c         |  199
-rw-r--r--  numpy/core/src/multiarray/textreading/readtext.h         |    7
-rw-r--r--  numpy/core/src/multiarray/textreading/rows.c             |  438
-rw-r--r--  numpy/core/src/multiarray/textreading/rows.h             |   22
-rw-r--r--  numpy/core/src/multiarray/textreading/str_to_int.c       |   87
-rw-r--r--  numpy/core/src/multiarray/textreading/str_to_int.h       |  175
-rw-r--r--  numpy/core/src/multiarray/textreading/stream.h           |   29
-rw-r--r--  numpy/core/src/multiarray/textreading/stream_pyobject.c  |  271
-rw-r--r--  numpy/core/src/multiarray/textreading/stream_pyobject.h  |   16
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.c.src     |  449
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.h         |   77
-rw-r--r--  numpy/lib/npyio.py                                       |  634
23 files changed, 2922 insertions, 311 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 22cac1e9a..3d7e958d3 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -868,6 +868,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'typeinfo.h'),
join('src', 'multiarray', 'usertypes.h'),
join('src', 'multiarray', 'vdot.h'),
+ join('src', 'multiarray', 'textreading', 'readtext.h'),
join('include', 'numpy', 'arrayobject.h'),
join('include', 'numpy', '_neighborhood_iterator_imp.h'),
join('include', 'numpy', 'npy_endian.h'),
@@ -955,6 +956,14 @@ def configuration(parent_package='',top_path=None):
join('src', 'npysort', 'selection.c.src'),
join('src', 'common', 'npy_binsearch.h'),
join('src', 'npysort', 'binsearch.cpp'),
+ join('src', 'multiarray', 'textreading', 'conversions.c'),
+ join('src', 'multiarray', 'textreading', 'field_types.c'),
+ join('src', 'multiarray', 'textreading', 'growth.c'),
+ join('src', 'multiarray', 'textreading', 'readtext.c'),
+ join('src', 'multiarray', 'textreading', 'rows.c'),
+ join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
+ join('src', 'multiarray', 'textreading', 'str_to_int.c'),
+ join('src', 'multiarray', 'textreading', 'tokenize.c.src'),
]
#######################################################################
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index a1de580d9..e4eb4f49e 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -993,6 +993,17 @@ PyArray_PyIntAsIntp(PyObject *o)
}
+NPY_NO_EXPORT int
+PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val)
+{
+ *val = PyArray_PyIntAsIntp(o);
+ if (error_converting(*val)) {
+ return NPY_FAIL;
+ }
+ return NPY_SUCCEED;
+}
+
+
/*
* PyArray_IntpFromIndexSequence
* Returns the number of dimensions or -1 if an error occurred.
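Note on the new converter above: it follows the usual 0 = failure / 1 = success protocol (NPY_FAIL / NPY_SUCCEED), which is the same convention CPython expects from "O&" converters, so it can be plugged directly into the argument parser used by readtext.c further down. A minimal standalone sketch of that converter shape, assuming only the CPython API; the names here are illustrative and not part of this change:

    #include <Python.h>

    /* Hypothetical converter in the same 0 = failure / 1 = success style as
     * PyArray_IntpFromPyIntConverter (NPY_FAIL == 0, NPY_SUCCEED == 1). */
    static int
    ssize_converter(PyObject *obj, void *out)
    {
        Py_ssize_t val = PyNumber_AsSsize_t(obj, PyExc_OverflowError);
        if (val == -1 && PyErr_Occurred()) {
            return 0;   /* failure */
        }
        *(Py_ssize_t *)out = val;
        return 1;       /* success */
    }

    /* Usage with the classic argument parser:
     *     Py_ssize_t skiprows = 0;
     *     if (!PyArg_ParseTuple(args, "O&", ssize_converter, &skiprows)) ...
     */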
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h
index 4072841ee..4d0fbb894 100644
--- a/numpy/core/src/multiarray/conversion_utils.h
+++ b/numpy/core/src/multiarray/conversion_utils.h
@@ -7,6 +7,9 @@ NPY_NO_EXPORT int
PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq);
NPY_NO_EXPORT int
+PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val);
+
+NPY_NO_EXPORT int
PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq);
typedef enum {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 789446d0c..a7b6898e1 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -69,6 +69,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "get_attr_string.h"
#include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */
+#include "textreading/readtext.h" /* _readtext_from_file_object */
#include "npy_dlpack.h"
@@ -4456,6 +4457,8 @@ static struct PyMethodDef array_module_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api,
METH_O, NULL},
+ {"_load_from_filelike", (PyCFunction)_load_from_filelike,
+ METH_FASTCALL | METH_KEYWORDS, NULL},
/* from umath */
{"frompyfunc",
(PyCFunction) ufunc_frompyfunc,
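The new method table entry is registered with METH_FASTCALL | METH_KEYWORDS, so the C implementation receives a vector of arguments, a positional count, and a tuple of keyword names rather than the classic (args, kwargs) pair. A minimal sketch of that calling convention, independent of the NumPy code (all names illustrative):

    #include <Python.h>

    /* Shape of a METH_FASTCALL | METH_KEYWORDS method (CPython >= 3.7). */
    static PyObject *
    example_fastcall(PyObject *module, PyObject *const *args,
                     Py_ssize_t nargs, PyObject *kwnames)
    {
        /* `args` holds `nargs` positional values followed by one value per
         * name in `kwnames` (a tuple of strings); no keyword dict is built. */
        Py_ssize_t total = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0);
        return PyLong_FromSsize_t(total);
    }

    static PyMethodDef example_methods[] = {
        {"example", (PyCFunction)example_fastcall,
         METH_FASTCALL | METH_KEYWORDS, NULL},
        {NULL, NULL, 0, NULL}
    };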
diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c
new file mode 100644
index 000000000..be697c380
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/conversions.c
@@ -0,0 +1,375 @@
+
+#include <Python.h>
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "conversions.h"
+#include "str_to_int.h"
+
+#include "array_coercion.h"
+
+
+/*
+ * Coercion to boolean is done via integer right now.
+ */
+int
+to_bool(PyArray_Descr *NPY_UNUSED(descr),
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(pconfig))
+{
+ int64_t res;
+ if (str_to_int64(str, end, INT64_MIN, INT64_MAX, &res) < 0) {
+ return -1;
+ }
+ *dataptr = (char)(res != 0);
+ return 0;
+}
+
+
+/*
+ * In order to not pack a whole copy of a floating point parser, we copy the
+ * input into ASCII and call the Python one. Float parsing isn't super quick,
+ * so this is not terrible, but avoiding it would speed things up.
+ *
+ * Also note that parsing the first float of a complex will copy the whole
+ * string to ASCII rather than just the first part.
+ * TODO: A tweak of the break might be a simple mitigation there.
+ *
+ * @param str The UCS4 string to parse
+ * @param end Pointer to the end of the string
+ * @param skip_trailing_whitespace If false does not skip trailing whitespace
+ * (used by the complex parser).
+ * @param result Output stored as double value.
+ */
+static NPY_INLINE int
+double_from_ucs4(
+ const Py_UCS4 *str, const Py_UCS4 *end,
+ bool skip_trailing_whitespace, double *result, const Py_UCS4 **p_end)
+{
+ /* skip leading whitespace */
+ while (Py_UNICODE_ISSPACE(*str)) {
+ str++;
+ }
+ if (str == end) {
+ return -1; /* empty or only whitespace: not a floating point number */
+ }
+
+ /* We convert to ASCII for the Python parser, use stack if small: */
+ char stack_buf[128];
+ char *heap_buf = NULL;
+ char *ascii = stack_buf;
+
+ size_t str_len = end - str;
+ if (str_len > 128) {
+ heap_buf = PyMem_MALLOC(str_len);
+ ascii = heap_buf;
+ }
+ char *c = ascii;
+ for (; str < end; str++, c++) {
+ if (NPY_UNLIKELY(*str >= 128)) {
+ break; /* the following cannot be a number anymore */
+ }
+ *c = (char)(*str);
+ }
+ *c = '\0';
+
+ char *end_parsed;
+ *result = PyOS_string_to_double(ascii, &end_parsed, NULL);
+ /* Rewind `end` to the first UCS4 character not parsed: */
+ end = end - (c - end_parsed);
+
+ PyMem_FREE(heap_buf);
+
+ if (*result == -1. && PyErr_Occurred()) {
+ return -1;
+ }
+
+ if (skip_trailing_whitespace) {
+ /* and then skip any remaining whitespace: */
+ while (Py_UNICODE_ISSPACE(*end)) {
+ end++;
+ }
+ }
+ *p_end = end;
+ return 0;
+}
+
+/*
+ * `item` must be the nul-terminated string that is to be
+ * converted to a double.
+ *
+ * To be successful, to_double() must use *all* the characters
+ * in `item`. E.g. "1.q25" will fail. Leading and trailing
+ * spaces are allowed.
+ */
+int
+to_float(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(pconfig))
+{
+ double double_val;
+ const Py_UCS4 *p_end;
+ if (double_from_ucs4(str, end, true, &double_val, &p_end) < 0) {
+ return -1;
+ }
+ if (p_end != end) {
+ return -1;
+ }
+
+ float val = (float)double_val;
+ memcpy(dataptr, &val, sizeof(float));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ descr->f->copyswap(dataptr, dataptr, 1, NULL);
+ }
+ return 0;
+}
+
+
+int
+to_double(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(pconfig))
+{
+ double val;
+ const Py_UCS4 *p_end;
+ if (double_from_ucs4(str, end, true, &val, &p_end) < 0) {
+ return -1;
+ }
+ if (p_end != end) {
+ return -1;
+ }
+
+ memcpy(dataptr, &val, sizeof(double));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ descr->f->copyswap(dataptr, dataptr, 1, NULL);
+ }
+ return 0;
+}
+
+
+static bool
+to_complex_int(
+ const Py_UCS4 *item, const Py_UCS4 *token_end,
+ double *p_real, double *p_imag,
+ Py_UCS4 imaginary_unit, bool allow_parens)
+{
+ const Py_UCS4 *p_end;
+ bool unmatched_opening_paren = false;
+
+ /* Remove whitespace before the possibly leading '(' */
+ while (Py_UNICODE_ISSPACE(*item)) {
+ ++item;
+ }
+ if (allow_parens && (*item == '(')) {
+ unmatched_opening_paren = true;
+ ++item;
+ }
+ if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) {
+ return false;
+ }
+ if (p_end == token_end) {
+ // No imaginary part in the string (e.g. "3.5")
+ *p_imag = 0.0;
+ return !unmatched_opening_paren;
+ }
+ if (*p_end == imaginary_unit) {
+ // Pure imaginary part only (e.g. "1.5j")
+ *p_imag = *p_real;
+ *p_real = 0.0;
+ ++p_end;
+ if (unmatched_opening_paren && (*p_end == ')')) {
+ ++p_end;
+ unmatched_opening_paren = false;
+ }
+ }
+ else if (unmatched_opening_paren && (*p_end == ')')) {
+ *p_imag = 0.0;
+ ++p_end;
+ unmatched_opening_paren = false;
+ }
+ else {
+ if (*p_end == '+') {
+ ++p_end;
+ }
+ if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) {
+ return false;
+ }
+ if (*p_end != imaginary_unit) {
+ return false;
+ }
+ ++p_end;
+ if (unmatched_opening_paren && (*p_end == ')')) {
+ ++p_end;
+ unmatched_opening_paren = false;
+ }
+ }
+ while (Py_UNICODE_ISSPACE(*p_end)) {
+ ++p_end;
+ }
+ return p_end == token_end;
+}
+
+
+int
+to_cfloat(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig)
+{
+ double real;
+ double imag;
+
+ bool success = to_complex_int(
+ str, end, &real, &imag,
+ pconfig->imaginary_unit, true);
+
+ if (!success) {
+ return -1;
+ }
+ npy_complex64 val = {(float)real, (float)imag};
+ memcpy(dataptr, &val, sizeof(npy_complex64));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ descr->f->copyswap(dataptr, dataptr, 1, NULL);
+ }
+ return 0;
+}
+
+
+int
+to_cdouble(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig)
+{
+ double real;
+ double imag;
+
+ bool success = to_complex_int(
+ str, end, &real, &imag, pconfig->imaginary_unit, true);
+
+ if (!success) {
+ return -1;
+ }
+ npy_complex128 val = {real, imag};
+ memcpy(dataptr, &val, sizeof(npy_complex128));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ descr->f->copyswap(dataptr, dataptr, 1, NULL);
+ }
+ return 0;
+}
+
+
+/*
+ * String and unicode conversion functions.
+ */
+int
+to_string(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(unused))
+{
+ const Py_UCS4* c = str;
+ size_t length = descr->elsize;
+
+ for (size_t i = 0; i < length; i++) {
+ if (c < end) {
+ /*
+ * loadtxt assumed latin1, which is compatible with UCS1 (first
+ * 256 unicode characters).
+ */
+ if (NPY_UNLIKELY(*c > 255)) {
+ /* TODO: Was UnicodeDecodeError, is unspecific error good? */
+ return -1;
+ }
+ dataptr[i] = (Py_UCS1)(*c);
+ c++;
+ }
+ else {
+ dataptr[i] = '\0';
+ }
+ }
+ return 0;
+}
+
+
+int
+to_unicode(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(unused))
+{
+ size_t length = descr->elsize / 4;
+
+ if (length <= (size_t)(end - str)) {
+ memcpy(dataptr, str, length * 4);
+ }
+ else {
+ size_t given_len = end - str;
+ memcpy(dataptr, str, given_len * 4);
+ memset(dataptr + given_len * 4, '\0', (length - given_len) * 4);
+ }
+
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ descr->f->copyswap(dataptr, dataptr, 1, NULL);
+ }
+ return 0;
+}
+
+
+
+/*
+ * Convert functions helper for the generic converter.
+ */
+static PyObject *
+call_converter_function(
+ PyObject *func, const Py_UCS4 *str, size_t length, bool byte_converters)
+{
+ PyObject *s = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str, length);
+ if (s == NULL) {
+ return s;
+ }
+ if (byte_converters) {
+ Py_SETREF(s, PyUnicode_AsEncodedString(s, "latin1", NULL));
+ if (s == NULL) {
+ return NULL;
+ }
+ }
+ if (func == NULL) {
+ return s;
+ }
+ PyObject *result = PyObject_CallFunctionObjArgs(func, s, NULL);
+ Py_DECREF(s);
+ return result;
+}
+
+
+int
+to_generic_with_converter(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *config, PyObject *func)
+{
+ bool use_byte_converter;
+ if (func == NULL) {
+ use_byte_converter = config->c_byte_converters;
+ }
+ else {
+ use_byte_converter = config->python_byte_converters;
+ }
+ /* Converts to unicode and calls custom converter (if set) */
+ PyObject *converted = call_converter_function(
+ func, str, (size_t)(end - str), use_byte_converter);
+ if (converted == NULL) {
+ return -1;
+ }
+
+ int res = PyArray_Pack(descr, dataptr, converted);
+ Py_DECREF(converted);
+ return res;
+}
+
+
+int
+to_generic(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *config)
+{
+ return to_generic_with_converter(descr, str, end, dataptr, config, NULL);
+}
\ No newline at end of file
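to_float/to_double above narrow the UCS4 token into a temporary ASCII buffer and hand it to Python's parser (PyOS_string_to_double); any code point >= 128 ends the copy early because it cannot be part of a number. A standalone sketch of the same narrowing idea, using strtod instead of the Python API (buffer size and names are illustrative; the real code falls back to a heap buffer for long tokens):

    #include <stdlib.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Parse a double from a UCS4 (uint32_t code point) token by narrowing it
     * to ASCII first; returns 0 on success, -1 if nothing could be parsed. */
    static int
    double_from_codepoints(const uint32_t *str, const uint32_t *end, double *out)
    {
        char buf[128];          /* assume short numeric tokens in this sketch */
        size_t n = 0;
        for (; str < end && n < sizeof(buf) - 1; str++, n++) {
            if (*str >= 128) {
                break;          /* non-ASCII cannot be part of a number */
            }
            buf[n] = (char)*str;
        }
        buf[n] = '\0';

        char *parse_end;
        *out = strtod(buf, &parse_end);
        return parse_end == buf ? -1 : 0;
    }

    int main(void)
    {
        const uint32_t token[] = {'1', '.', '2', '5', 'e', '2'};
        double val;
        if (double_from_codepoints(token, token + 6, &val) == 0) {
            printf("%g\n", val);   /* prints 125 */
        }
        return 0;
    }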
diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h
new file mode 100644
index 000000000..6308c10d4
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/conversions.h
@@ -0,0 +1,57 @@
+#ifndef CONVERSIONS_H
+#define CONVERSIONS_H
+
+#include <stdbool.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+
+#include "textreading/parser_config.h"
+
+int
+to_bool(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+int
+to_float(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+int
+to_double(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+int
+to_cfloat(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+int
+to_cdouble(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+int
+to_string(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *unused);
+
+int
+to_unicode(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *unused);
+
+int
+to_generic_with_converter(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *unused, PyObject *func);
+
+int
+to_generic(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+#endif
diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c
new file mode 100644
index 000000000..914c8e4d8
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/field_types.c
@@ -0,0 +1,200 @@
+#include "field_types.h"
+#include "conversions.h"
+#include "str_to_int.h"
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+#include "alloc.h"
+
+#include "textreading/growth.h"
+
+
+void
+field_types_xclear(int num_field_types, field_type *ft) {
+ assert(num_field_types >= 0);
+ if (ft == NULL) {
+ return;
+ }
+ for (int i = 0; i < num_field_types; i++) {
+ Py_XDECREF(ft[i].descr);
+ ft[i].descr = NULL;
+ }
+ PyMem_Free(ft);
+}
+
+
+/*
+ * Fetch custom converters for the builtin NumPy DTypes (or the generic one).
+ * Structured DTypes get unpacked and `object` uses the generic method.
+ *
+ * TODO: This should probably be moved on the DType object in some form,
+ * to allow user DTypes to define their own converters.
+ */
+static set_from_ucs4_function *
+get_from_ucs4_function(PyArray_Descr *descr)
+{
+ if (descr->type_num == NPY_BOOL) {
+ return &to_bool;
+ }
+ else if (PyDataType_ISSIGNED(descr)) {
+ switch (descr->elsize) {
+ case 1:
+ return &to_int8;
+ case 2:
+ return &to_int16;
+ case 4:
+ return &to_int32;
+ case 8:
+ return &to_int64;
+ default:
+ assert(0);
+ }
+ }
+ else if (PyDataType_ISUNSIGNED(descr)) {
+ switch (descr->elsize) {
+ case 1:
+ return &to_uint8;
+ case 2:
+ return &to_uint16;
+ case 4:
+ return &to_uint32;
+ case 8:
+ return &to_uint64;
+ default:
+ assert(0);
+ }
+ }
+ else if (descr->type_num == NPY_FLOAT) {
+ return &to_float;
+ }
+ else if (descr->type_num == NPY_DOUBLE) {
+ return &to_double;
+ }
+ else if (descr->type_num == NPY_CFLOAT) {
+ return &to_cfloat;
+ }
+ else if (descr->type_num == NPY_CDOUBLE) {
+ return &to_cdouble;
+ }
+ else if (descr->type_num == NPY_STRING) {
+ return &to_string;
+ }
+ else if (descr->type_num == NPY_UNICODE) {
+ return &to_unicode;
+ }
+ return &to_generic;
+}
+
+
+/*
+ * Note that the function cleans up `ft` on error. If `num_field_types < 0`
+ * cleanup has already happened in the internal call.
+ */
+static npy_intp
+field_type_grow_recursive(PyArray_Descr *descr,
+ npy_intp num_field_types, field_type **ft, npy_intp *ft_size,
+ npy_intp field_offset)
+{
+ if (PyDataType_HASSUBARRAY(descr)) {
+ PyArray_Dims shape = {NULL, -1};
+
+ if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) {
+ PyErr_SetString(PyExc_ValueError, "invalid subarray shape");
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ npy_intp size = PyArray_MultiplyList(shape.ptr, shape.len);
+ npy_free_cache_dim_obj(shape);
+ for (npy_intp i = 0; i < size; i++) {
+ num_field_types = field_type_grow_recursive(descr->subarray->base,
+ num_field_types, ft, ft_size, field_offset);
+ field_offset += descr->subarray->base->elsize;
+ if (num_field_types < 0) {
+ return -1;
+ }
+ }
+ return num_field_types;
+ }
+ else if (PyDataType_HASFIELDS(descr)) {
+ npy_int num_descr_fields = PyTuple_Size(descr->names);
+ if (num_descr_fields < 0) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ for (npy_intp i = 0; i < num_descr_fields; i++) {
+ PyObject *key = PyTuple_GET_ITEM(descr->names, i);
+ PyObject *tup = PyObject_GetItem(descr->fields, key);
+ if (tup == NULL) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ PyArray_Descr *field_descr;
+ PyObject *title;
+ int offset;
+ if (!PyArg_ParseTuple(tup, "Oi|O", &field_descr, &offset, &title)) {
+ Py_DECREF(tup);
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ num_field_types = field_type_grow_recursive(
+ field_descr, num_field_types, ft, ft_size,
+ field_offset + offset);
+ if (num_field_types < 0) {
+ return -1;
+ }
+ }
+ return num_field_types;
+ }
+
+ if (*ft_size <= num_field_types) {
+ npy_intp alloc_size = grow_size_and_multiply(
+ ft_size, 4, sizeof(field_type));
+ if (alloc_size < 0) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ field_type *new_ft = PyMem_Realloc(*ft, alloc_size);
+ if (new_ft == NULL) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ *ft = new_ft;
+ }
+
+ Py_INCREF(descr);
+ (*ft)[num_field_types].descr = descr;
+ (*ft)[num_field_types].set_from_ucs4 = get_from_ucs4_function(descr);
+ (*ft)[num_field_types].structured_offset = field_offset;
+
+ return num_field_types + 1;
+}
+
+
+/*
+ * Prepare the "field_types" for the given dtypes/descriptors. Currently,
+ * we copy the itemsize, but the main thing is that we check for custom
+ * converters.
+ */
+npy_intp
+field_types_create(PyArray_Descr *descr, field_type **ft)
+{
+ if (descr->subarray != NULL) {
+ /*
+ * This could probably be allowed, but NumPy absorbs the dimensions
+ * so it is an awkward corner case that probably never really worked.
+ */
+ PyErr_SetString(PyExc_TypeError,
+ "file reader does not support subarray dtypes. You can"
+ "put the dtype into a structured one using "
+ "`np.dtype(('name', dtype))` to avoid this limitation.");
+ return -1;
+ }
+
+ npy_intp ft_size = 4;
+ *ft = PyMem_Malloc(ft_size * sizeof(field_type));
+ if (*ft == NULL) {
+ return -1;
+ }
+ return field_type_grow_recursive(descr, 0, ft, &ft_size, 0);
+}
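field_types_create above walks a (possibly nested) structured descriptor and flattens it into a linear array of {converter, descriptor, offset} entries, so the row filler in rows.c only ever loops over leaf fields. A minimal standalone sketch of that flattened-table pattern, with plain C types standing in for dtypes and sscanf standing in for the real converters (everything here is illustrative):

    #include <stdio.h>
    #include <stddef.h>

    /* One (converter, byte offset) entry per leaf column of a row. */
    typedef int (*set_field_fn)(const char *token, char *dataptr);

    static int set_int(const char *token, char *dataptr)
    {
        int v = 0;
        sscanf(token, "%d", &v);
        *(int *)dataptr = v;
        return 0;
    }

    static int set_double(const char *token, char *dataptr)
    {
        double v = 0.0;
        sscanf(token, "%lf", &v);
        *(double *)dataptr = v;
        return 0;
    }

    typedef struct { int id; double value; } row_t;  /* a "structured dtype" */

    int main(void)
    {
        /* The flattened field table: converter + offset within one row. */
        struct { set_field_fn set; size_t offset; } fields[] = {
            {set_int,    offsetof(row_t, id)},
            {set_double, offsetof(row_t, value)},
        };
        const char *tokens[] = {"7", "2.5"};

        row_t row;
        for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
            fields[i].set(tokens[i], (char *)&row + fields[i].offset);
        }
        printf("%d %g\n", row.id, row.value);
        return 0;
    }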
diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h
new file mode 100644
index 000000000..5c4cfb2c6
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/field_types.h
@@ -0,0 +1,49 @@
+
+#ifndef _FIELD_TYPES_H_
+#define _FIELD_TYPES_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/parser_config.h"
+
+/*
+ * The original code had some error details, but I assume that we don't need
+ * them. Printing the string that we failed to convert should be fine.
+ * This should potentially be public NumPy API, although that is tricky.
+ *
+ * This function must support unaligned memory access.
+ *
+ * NOTE: An earlier version of the code had unused default versions (pandas
+ * does this) when columns are missing. We could define this either
+ * by passing `NULL` in, or by adding a default explicitly somewhere.
+ * (I think users should probably have to define the default, at which
+ * point it doesn't matter here.)
+ *
+ * NOTE: We are currently passing the parser config, this could be made public
+ * or could be set up to be dtype specific/private. Always passing
+ * pconfig fully seems easier right now even if it may change.
+ */
+typedef int (set_from_ucs4_function)(
+ PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end,
+ char *dataptr, parser_config *pconfig);
+
+typedef struct _field_type {
+ set_from_ucs4_function *set_from_ucs4;
+ /* The original NumPy descriptor */
+ PyArray_Descr *descr;
+ /* Offset to this entry within row. */
+ npy_intp structured_offset;
+} field_type;
+
+
+void
+field_types_xclear(int num_field_types, field_type *ft);
+
+npy_intp
+field_types_create(PyArray_Descr *descr, field_type **ft);
+
+#endif
diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c
new file mode 100644
index 000000000..a38c6d5aa
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/growth.c
@@ -0,0 +1,38 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "templ_common.h"
+
+#include "textreading/growth.h"
+
+
+/*
+ * Helper function taking the size input and growing it (based on min_grow).
+ * It further multiplies it by `itemsize` and ensures that all results fit
+ * into an `npy_intp`.
+ * Returns -1 if any overflow occurred or the result would not fit.
+ * The user has to ensure the input is size_t (i.e. unsigned).
+ */
+npy_intp
+grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize) {
+ /* min_grow must be a power of two: */
+ assert((min_grow & (min_grow - 1)) == 0);
+ size_t growth = *size >> 2;
+ if (growth <= min_grow) {
+ *size += min_grow;
+ }
+ else {
+ *size += growth + min_grow - 1;
+ *size &= ~min_grow;
+
+ if (*size > NPY_MAX_INTP) {
+ return -1;
+ }
+ }
+
+ npy_intp res;
+ if (npy_mul_with_overflow_intp(&res, (npy_intp)*size, itemsize)) {
+ return -1;
+ }
+ return res;
+}
+
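grow_size_and_multiply implements the policy used when the total number of rows is unknown: grow by roughly 25%, but by at least a minimum block, and verify that rows * itemsize still fits in a signed index type. A simplified standalone sketch of the same policy (the rounding-to-a-block-multiple detail of the real function is omitted; names are illustrative):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Grow `*size` by ~25% (at least `min_grow` elements) and return the byte
     * count size * itemsize, or -1 on overflow. Simplified sketch only. */
    static ptrdiff_t
    grow_and_multiply(size_t *size, size_t min_grow, size_t itemsize)
    {
        size_t growth = *size / 4;
        if (growth < min_grow) {
            growth = min_grow;
        }
        *size += growth;
        if (itemsize != 0 && *size > (size_t)PTRDIFF_MAX / itemsize) {
            return -1;   /* rows * itemsize would not fit in a signed index */
        }
        return (ptrdiff_t)(*size * itemsize);
    }

    int main(void)
    {
        size_t rows = 0;
        for (int i = 0; i < 4; i++) {
            ptrdiff_t bytes = grow_and_multiply(&rows, 512, 8);
            printf("rows=%zu bytes=%td\n", rows, bytes);
        }
        return 0;
    }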
diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h
new file mode 100644
index 000000000..debe9a7b3
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/growth.h
@@ -0,0 +1,7 @@
+#ifndef _NPY_GROWTH_H
+#define _NPY_GROWTH_H
+
+npy_intp
+grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize);
+
+#endif /*_NPY_GROWTH_H */
diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h
new file mode 100644
index 000000000..c60565de1
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/parser_config.h
@@ -0,0 +1,77 @@
+
+#ifndef _PARSER_CONFIG_H_
+#define _PARSER_CONFIG_H_
+
+#include <stdbool.h>
+
+typedef struct {
+ /*
+ * Field delimiter character.
+ * Typically ',', ' ', '\t', ignored if `delimiter_is_whitespace` is true.
+ */
+ Py_UCS4 delimiter;
+
+ /*
+ * Character used to quote fields.
+ * Typically '"' or "'". To disable quoting we set this to UINT_MAX
+ * (which is not a valid unicode character and thus cannot occur in the
+ * file; the same is used for all other characters if necessary).
+ */
+ Py_UCS4 quote;
+
+ /*
+ * Character(s) that indicates the start of a comment.
+ * Typically '#', '%' or ';'.
+ * When encountered in a line and not inside quotes, all characters
+ * from the comment character(s) to the end of the line are ignored.
+ */
+ Py_UCS4 comment;
+
+ /*
+ * Ignore whitespace at the beginning of a field (outside/before quotes).
+ * Is (and must be) set if `delimiter_is_whitespace`.
+ */
+ bool ignore_leading_whitespace;
+
+ /*
+ * If true, the delimiter is ignored and any unicode whitespace is used
+ * for splitting (same as `string.split()` in Python). In that case
+ * `ignore_leading_whitespace` should also be set.
+ */
+ bool delimiter_is_whitespace;
+
+ /*
+ * A boolean value (0 or 1). If 1, quoted fields may span
+ * more than one line. For example, the following
+ * 100, 200, "FOO
+ * BAR"
+ * is one "row", containing three fields: 100, 200 and "FOO\nBAR".
+ * If 0, the parser considers an unclosed quote to be an error. (XXX Check!)
+ */
+ bool allow_embedded_newline;
+
+ /*
+ * The imaginary unit character. Default is `j`.
+ */
+ Py_UCS4 imaginary_unit;
+
+ /*
+ * If true, when an integer dtype is given, the field is allowed
+ * to contain a floating point value. It will be cast to the
+ * integer type.
+ */
+ bool allow_float_for_int;
+ /*
+ * Data should be encoded as `latin1` when using a Python converter
+ * (implementing `loadtxt`'s default Python 2 compatibility mode).
+ * The C byte converter is used when the user requested `dtype="S"`.
+ * In this case we go via `dtype=object`; however, loadtxt allows latin1
+ * while normal object-to-string casts only accept ASCII, so this ensures
+ * that the object array already contains bytes and not strings.
+ */
+ bool python_byte_converters;
+ bool c_byte_converters;
+} parser_config;
+
+
+#endif
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
new file mode 100644
index 000000000..750e77b2d
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -0,0 +1,199 @@
+#include <stdio.h>
+#include <stdbool.h>
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+#include "npy_argparse.h"
+#include "conversion_utils.h"
+
+#include "textreading/parser_config.h"
+#include "textreading/stream_pyobject.h"
+#include "textreading/field_types.h"
+#include "textreading/rows.h"
+#include "textreading/str_to_int.h"
+
+
+//
+// `usecols` must point to a Python object that is Py_None or a 1-d contiguous
+// numpy array with data type int32.
+//
+// `dtype` must point to a Python object that is Py_None or a numpy dtype
+// instance. If the latter, code and sizes must be arrays of length
+// num_dtype_fields, holding the flattened data field type codes and byte
+// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype,
+// but we do that in Python code.)
+//
+// If both `usecols` and `dtype` are not None, and the data type is compound,
+// then len(usecols) must equal num_dtype_fields.
+//
+// If `dtype` is given and it is compound, and `usecols` is None, then the
+// number of columns in the file must match the number of fields in `dtype`.
+//
+static PyObject *
+_readtext_from_stream(stream *s, parser_config *pc,
+ PyObject *usecols, Py_ssize_t skiprows, Py_ssize_t max_rows,
+ PyObject *converters, PyObject *dtype)
+{
+ PyArrayObject *arr = NULL;
+ PyArray_Descr *out_dtype = NULL;
+ int32_t *cols;
+ int ncols;
+ field_type *ft = NULL;
+
+ /* TODO: Find better solution maybe? */
+ if (double_descr == NULL) {
+ double_descr = PyArray_DescrFromType(NPY_DOUBLE);
+ }
+
+ /*
+ * If the input dtype was not structured (`ft[0].descr` is the full dtype),
+ * the result is considered "homogeneous" and we have to discover the
+ * number of columns.
+ */
+ out_dtype = (PyArray_Descr *)dtype;
+ Py_INCREF(out_dtype);
+
+ npy_intp num_fields = field_types_create(out_dtype, &ft);
+ if (num_fields < 0) {
+ goto finish;
+ }
+ bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype;
+
+ if (usecols == Py_None) {
+ ncols = num_fields;
+ cols = NULL;
+ }
+ else {
+ ncols = PyArray_SIZE((PyArrayObject *)usecols);
+ cols = PyArray_DATA((PyArrayObject *)usecols);
+ }
+
+ arr = read_rows(
+ s, max_rows, num_fields, ft, pc,
+ ncols, cols, skiprows, converters,
+ NULL, out_dtype, homogeneous);
+ if (arr == NULL) {
+ goto finish;
+ }
+
+ finish:
+ Py_XDECREF(out_dtype);
+ field_types_xclear(num_fields, ft);
+ return (PyObject *)arr;
+}
+
+
+static int
+parse_control_character(PyObject *obj, Py_UCS4 *character)
+{
+ if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) > 1) {
+ PyErr_Format(PyExc_TypeError,
+ "Control character must be a single unicode character or "
+ "empty unicode string; but got: %.100R", obj);
+ return 0;
+ }
+ if (PyUnicode_GET_LENGTH(obj) == 0) {
+ *character = (Py_UCS4)-1; /* character beyond unicode range */
+ return 1;
+ }
+ *character = PyUnicode_READ_CHAR(obj, 0);
+ return 1;
+}
+
+
+NPY_NO_EXPORT PyObject *
+_load_from_filelike(PyObject *NPY_UNUSED(mod),
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+ PyObject *file;
+ Py_ssize_t skiprows = 0;
+ Py_ssize_t max_rows = -1;
+ PyObject *usecols = Py_None;
+ PyObject *converters = Py_None;
+
+ PyObject *dtype = Py_None;
+ PyObject *encoding_obj = Py_None;
+ const char *encoding = NULL;
+
+ parser_config pc = {
+ .delimiter = ',',
+ .comment = '#',
+ .quote = '"',
+ .imaginary_unit = 'j',
+ .allow_float_for_int = true,
+ .allow_embedded_newline = true,
+ .delimiter_is_whitespace = false,
+ .ignore_leading_whitespace = false,
+ .python_byte_converters = false,
+ .c_byte_converters = false,
+ };
+ bool filelike = true;
+
+ PyObject *arr = NULL;
+
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames,
+ "file", NULL, &file,
+ "|delimiter", &parse_control_character, &pc.delimiter,
+ "|comment", &parse_control_character, &pc.comment,
+ "|quote", &parse_control_character, &pc.quote,
+ "|imaginary_unit", &parse_control_character, &pc.imaginary_unit,
+ "|usecols", NULL, &usecols,
+ "|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows,
+ "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows,
+ "|converters", NULL, &converters,
+ "|dtype", NULL, &dtype,
+ "|encoding", NULL, &encoding_obj,
+ "|filelike", &PyArray_BoolConverter, &filelike,
+ "|byte_converters", &PyArray_BoolConverter, &pc.python_byte_converters,
+ "|c_byte_converters", PyArray_BoolConverter, &pc.c_byte_converters,
+ NULL, NULL, NULL) < 0) {
+ return NULL;
+ }
+
+ if (pc.delimiter == (Py_UCS4)-1) {
+ pc.delimiter_is_whitespace = true;
+ /* Ignore leading whitespace to match `string.split(None)` */
+ pc.ignore_leading_whitespace = true;
+ }
+
+ if (!PyArray_DescrCheck(dtype)) {
+ PyErr_SetString(PyExc_TypeError,
+ "internal error: dtype must be provided and be a NumPy dtype");
+ return NULL;
+ }
+
+ if (encoding_obj != Py_None) {
+ if (!PyUnicode_Check(encoding_obj)) {
+ PyErr_SetString(PyExc_TypeError,
+ "encoding must be a unicode string.");
+ return NULL;
+ }
+ encoding = PyUnicode_AsUTF8(encoding_obj);
+ if (encoding == NULL) {
+ return NULL;
+ }
+ }
+
+ stream *s;
+ if (filelike) {
+ s = stream_python_file(file, encoding);
+ }
+ else {
+ s = stream_python_iterable(file, encoding);
+ }
+ if (s == NULL) {
+ PyErr_Format(PyExc_RuntimeError, "Unable to access the file.");
+ return NULL;
+ }
+
+ arr = _readtext_from_stream(s, &pc, usecols, skiprows, max_rows,
+ converters, dtype);
+ stream_close(s);
+ return arr;
+}
+
diff --git a/numpy/core/src/multiarray/textreading/readtext.h b/numpy/core/src/multiarray/textreading/readtext.h
new file mode 100644
index 000000000..8c4707368
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/readtext.h
@@ -0,0 +1,7 @@
+#ifndef READTEXT_H_
+#define READTEXT_H_
+
+NPY_NO_EXPORT PyObject *
+_load_from_filelike(PyObject *self, PyObject *args, PyObject *kwargs);
+
+#endif /* READTEXT_H_ */
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
new file mode 100644
index 000000000..9301abd5c
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -0,0 +1,438 @@
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+#include "numpy/npy_3kcompat.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "textreading/stream.h"
+#include "textreading/tokenize.h"
+#include "textreading/conversions.h"
+#include "textreading/field_types.h"
+#include "textreading/rows.h"
+#include "textreading/growth.h"
+
+/*
+ * Minimum size to grow the allocation by (or 25%). The 8 KiB means the actual
+ * growth is within `8 KiB <= size < 16 KiB` (depending on the row size).
+ */
+#define MIN_BLOCK_SIZE (1 << 13)
+
+
+
+/*
+ * Create the array of converter functions from the Python converters.
+ */
+PyObject **
+create_conv_funcs(
+ PyObject *converters, int num_fields, int32_t *usecols)
+{
+ PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *));
+ if (conv_funcs == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ if (converters == Py_None) {
+ return conv_funcs;
+ }
+ else if (!PyDict_Check(converters)) {
+ PyErr_SetString(PyExc_TypeError,
+ "converters must be a dictionary mapping columns to converter "
+ "functions.");
+ return NULL;
+ }
+
+ PyObject *key, *value;
+ Py_ssize_t pos = 0;
+ while (PyDict_Next(converters, &pos, &key, &value)) {
+ Py_ssize_t column = PyNumber_AsSsize_t(key, PyExc_IndexError);
+ if (column == -1 && PyErr_Occurred()) {
+ PyErr_Format(PyExc_TypeError,
+ "keys of the converters dictionary must be integers; "
+ "got %.100R", key);
+ goto error;
+ }
+ if (usecols != NULL) {
+ /*
+ * This code searches for the corresponding usecol. It is
+ * identical to the legacy usecols code, which has two weaknesses:
+ * 1. For duplicated usecols it only sets the converter for
+ * the first one.
+ * 2. It fails e.g. if usecols uses negative indexing and
+ * converters does not. (This is a feature, since it allows
+ * us to correctly normalize converters to result column here.)
+ */
+ int i = 0;
+ for (; i < num_fields; i++) {
+ if (column == usecols[i]) {
+ column = i;
+ break;
+ }
+ }
+ if (i == num_fields) {
+ continue; /* ignore unused converter */
+ }
+ }
+ else {
+ if (column < -num_fields || column >= num_fields) {
+ PyErr_Format(PyExc_ValueError,
+ "converter specified for column %zd, which is invalid "
+ "for the number of fields %d.", column, num_fields);
+ goto error;
+ }
+ if (column < 0) {
+ column += num_fields;
+ }
+ }
+ if (!PyCallable_Check(value)) {
+ PyErr_Format(PyExc_TypeError,
+ "values of the converters dictionary must be callable, "
+ "but the value associated with key %R is not", key);
+ goto error;
+ }
+ Py_INCREF(value);
+ conv_funcs[column] = value;
+ }
+ return conv_funcs;
+
+ error:
+ for (int i = 0; i < num_fields; i++) {
+ Py_XDECREF(conv_funcs[i]);
+ }
+ PyMem_FREE(conv_funcs);
+ return NULL;
+}
+
+/**
+ * Read a file into the provided array, or create (and possibly grow) an
+ * array to read into.
+ *
+ * @param s The stream object/struct providing reading capabilities used by
+ * the tokenizer.
+ * @param max_rows The number of rows to read, or -1. If negative
+ * all rows are read.
+ * @param num_field_types The number of field types stored in `field_types`.
+ * @param field_types Information about the dtype for each column (or one if
+ * `homogeneous`).
+ * @param pconfig Pointer to the parser config object used by both the
+ * tokenizer and the conversion functions.
+ * @param num_usecols The number of columns in `usecols`.
+ * @param usecols An array of length `num_usecols` or NULL. If given indicates
+ * which column is read for each individual row (negative columns are
+ * accepted).
+ * @param skiplines The number of lines to skip; these lines are ignored.
+ * @param converters Python dictionary of converters. Finalizing converters
+ * is difficult without information about the number of columns.
+ * @param data_array An array to be filled or NULL. In either case a new
+ * reference is returned (the reference to `data_array` is not stolen).
+ * @param out_descr The dtype used for allocating a new array. This is not
+ * used if `data_array` is provided. Note that the actual dtype of the
+ * returned array can differ for strings.
+ * @param num_cols Pointer in which the actual (discovered) number of columns
+ * is returned. This is only relevant if `homogeneous` is true.
+ * @param homogeneous Whether the datatype of the array is homogeneous,
+ * i.e. not structured. In this case the number of columns has to be
+ * discovered and the returned array will be 2-dimensional rather than
+ * 1-dimensional.
+ *
+ * @returns Returns the result as an array object or NULL on error. The result
+ * is always a new reference (even when `data_array` was passed in).
+ */
+PyArrayObject *
+read_rows(stream *s,
+ npy_intp max_rows, int num_field_types, field_type *field_types,
+ parser_config *pconfig, int num_usecols, int *usecols,
+ Py_ssize_t skiplines, PyObject *converters,
+ PyArrayObject *data_array, PyArray_Descr *out_descr,
+ bool homogeneous)
+{
+ char *data_ptr = NULL;
+ int current_num_fields;
+ size_t row_size = out_descr->elsize;
+ PyObject **conv_funcs = NULL;
+
+ bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT);
+
+ int ndim = homogeneous ? 2 : 1;
+ npy_intp result_shape[2] = {0, 1};
+
+ bool data_array_allocated = data_array == NULL;
+ /* Make sure we own `data_array` for the purpose of error handling */
+ Py_XINCREF(data_array);
+ size_t rows_per_block = 1; /* will be increased depending on row size */
+ Py_ssize_t data_allocated_rows = 0;
+
+ int ts_result = 0;
+ tokenizer_state ts;
+ if (tokenizer_init(&ts, pconfig) < 0) {
+ goto error;
+ }
+
+ /* Set the actual number of fields if it is already known, otherwise -1 */
+ int actual_num_fields = -1;
+ if (usecols != NULL) {
+ actual_num_fields = num_usecols;
+ }
+ else if (!homogeneous) {
+ actual_num_fields = num_field_types;
+ }
+
+ for (; skiplines > 0; skiplines--) {
+ ts.state = TOKENIZE_GOTO_LINE_END;
+ ts_result = tokenize(s, &ts, pconfig);
+ if (ts_result < 0) {
+ goto error;
+ }
+ else if (ts_result != 0) {
+ /* Fewer lines than skiplines is acceptable */
+ break;
+ }
+ }
+
+ Py_ssize_t row_count = 0; /* number of rows actually processed */
+ while ((max_rows < 0 || row_count < max_rows) && ts_result == 0) {
+ ts_result = tokenize(s, &ts, pconfig);
+ if (ts_result < 0) {
+ goto error;
+ }
+ current_num_fields = ts.num_fields;
+ field_info *fields = ts.fields;
+ if (ts.num_fields == 0) {
+ continue; /* Ignore empty line */
+ }
+
+ if (NPY_UNLIKELY(data_ptr == NULL)) {
+ // We've deferred some of the initialization tasks to here,
+ // because we've now read the first line, and we definitively
+ // know how many fields (i.e. columns) we will be processing.
+ if (actual_num_fields == -1) {
+ actual_num_fields = current_num_fields;
+ }
+
+ conv_funcs = create_conv_funcs(
+ converters, actual_num_fields, usecols);
+ if (conv_funcs == NULL) {
+ goto error;
+ }
+
+ /* Note that result_shape[1] is only used if homogeneous is true */
+ result_shape[1] = actual_num_fields;
+ if (homogeneous) {
+ row_size *= actual_num_fields;
+ }
+
+ if (data_array == NULL) {
+ if (max_rows < 0) {
+ /*
+ * A negative max_rows means the whole file is read; we
+ * approach this by allocating ever larger blocks.
+ * Adds a number of rows based on `MIN_BLOCK_SIZE`.
+ * Note: later code grows assuming this is a power of two.
+ */
+ if (row_size == 0) {
+ /* actual rows_per_block should not matter here */
+ rows_per_block = 512;
+ }
+ else {
+ /* safe on overflow since min_rows will be 0 or 1 */
+ size_t min_rows = (
+ (MIN_BLOCK_SIZE + row_size - 1) / row_size);
+ while (rows_per_block < min_rows) {
+ rows_per_block *= 2;
+ }
+ }
+ data_allocated_rows = rows_per_block;
+ }
+ else {
+ data_allocated_rows = max_rows;
+ }
+ result_shape[0] = data_allocated_rows;
+ Py_INCREF(out_descr);
+ /*
+ * We do not use Empty, as it would fill with None
+ * and require decref'ing if we shrink again.
+ */
+ data_array = (PyArrayObject *)PyArray_SimpleNewFromDescr(
+ ndim, result_shape, out_descr);
+ if (data_array == NULL) {
+ goto error;
+ }
+ if (needs_init) {
+ memset(PyArray_BYTES(data_array), 0, PyArray_NBYTES(data_array));
+ }
+ }
+ else {
+ assert(max_rows >=0);
+ data_allocated_rows = max_rows;
+ }
+ data_ptr = PyArray_BYTES(data_array);
+ }
+
+ if (!usecols && (actual_num_fields != current_num_fields)) {
+ PyErr_Format(PyExc_ValueError,
+ "the number of columns changed from %d to %d at row %zu; "
+ "use `usecols` to select a subset and avoid this error",
+ actual_num_fields, current_num_fields, row_count+1);
+ goto error;
+ }
+
+ if (NPY_UNLIKELY(data_allocated_rows == row_count)) {
+ /*
+ * Grow by ~25% and round up to the next multiple of rows_per_block.
+ * NOTE: This is based on very crude timings and could be refined!
+ */
+ size_t new_rows = data_allocated_rows;
+ npy_intp alloc_size = grow_size_and_multiply(
+ &new_rows, rows_per_block, row_size);
+ if (alloc_size < 0) {
+ /* should normally error much earlier, but make sure */
+ PyErr_SetString(PyExc_ValueError,
+ "array is too big. Cannot read file as a single array; "
+ "providing a maximum number of rows to read may help.");
+ goto error;
+ }
+
+ char *new_data = PyDataMem_RENEW(
+ PyArray_BYTES(data_array), alloc_size ? alloc_size : 1);
+ if (new_data == NULL) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ /* Replace the array's data since it may have changed */
+ ((PyArrayObject_fields *)data_array)->data = new_data;
+ ((PyArrayObject_fields *)data_array)->dimensions[0] = new_rows;
+ data_ptr = new_data + row_count * row_size;
+ data_allocated_rows = new_rows;
+ if (needs_init) {
+ memset(data_ptr, '\0', (new_rows - row_count) * row_size);
+ }
+ }
+
+ for (int i = 0; i < actual_num_fields; ++i) {
+ int f; /* The field, either 0 (if homogeneous) or i. */
+ int col; /* The column as read, remapped by usecols */
+ char *item_ptr;
+ if (homogeneous) {
+ f = 0;
+ item_ptr = data_ptr + i * field_types[0].descr->elsize;
+ }
+ else {
+ f = i;
+ item_ptr = data_ptr + field_types[f].structured_offset;
+ }
+
+ if (usecols == NULL) {
+ col = i;
+ }
+ else {
+ col = usecols[i];
+ if (col < 0) {
+ // Python-like column indexing: k = -1 means the last column.
+ col += current_num_fields;
+ }
+ if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) {
+ PyErr_Format(PyExc_ValueError,
+ "invalid column index %d at row %zu with %d "
+ "columns",
+ usecols[i], current_num_fields, row_count+1);
+ goto error;
+ }
+ }
+
+ bool err = 0;
+ Py_UCS4 *str = ts.field_buffer + fields[col].offset;
+ Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1;
+ if (conv_funcs[i] == NULL) {
+ if (field_types[f].set_from_ucs4(field_types[f].descr,
+ str, end, item_ptr, pconfig) < 0) {
+ err = true;
+ }
+ }
+ else {
+ if (to_generic_with_converter(field_types[f].descr,
+ str, end, item_ptr, pconfig, conv_funcs[i]) < 0) {
+ err = true;
+ }
+ }
+
+ if (NPY_UNLIKELY(err)) {
+ PyObject *exc, *val, *tb;
+ PyErr_Fetch(&exc, &val, &tb);
+
+ size_t length = end - str;
+ PyObject *string = PyUnicode_FromKindAndData(
+ PyUnicode_4BYTE_KIND, str, length);
+ if (string == NULL) {
+ npy_PyErr_ChainExceptions(exc, val, tb);
+ goto error;
+ }
+ PyErr_Format(PyExc_ValueError,
+ "could not convert string %.100R to %S at "
+ "row %zu, column %d.",
+ string, field_types[f].descr, row_count, col+1);
+ Py_DECREF(string);
+ npy_PyErr_ChainExceptionsCause(exc, val, tb);
+ goto error;
+ }
+ }
+
+ ++row_count;
+ data_ptr += row_size;
+ }
+
+ tokenizer_clear(&ts);
+ PyMem_FREE(conv_funcs);
+
+ if (data_array == NULL) {
+ assert(row_count == 0 && result_shape[0] == 0);
+ if (actual_num_fields == -1) {
+ /*
+ * We found no rows and have to discover the number of elements;
+ * we have no choice but to guess 1.
+ * NOTE: It may make sense to move this outside of here to refine
+ * the behaviour where necessary.
+ */
+ result_shape[1] = 1;
+ }
+ else {
+ result_shape[1] = actual_num_fields;
+ }
+ Py_INCREF(out_descr);
+ data_array = (PyArrayObject *)PyArray_Empty(
+ ndim, result_shape, out_descr, 0);
+ }
+
+ /*
+ * Note that if there is no data, `data_array` may still be NULL and
+ * row_count is 0. In that case, always realloc just in case.
+ */
+ if (data_array_allocated && data_allocated_rows != row_count) {
+ size_t size = row_count * row_size;
+ char *new_data = PyDataMem_RENEW(
+ PyArray_BYTES(data_array), size ? size : 1);
+ if (new_data == NULL) {
+ Py_DECREF(data_array);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ ((PyArrayObject_fields *)data_array)->data = new_data;
+ ((PyArrayObject_fields *)data_array)->dimensions[0] = row_count;
+ }
+
+ return data_array;
+
+ error:
+ PyMem_FREE(conv_funcs);
+ tokenizer_clear(&ts);
+ Py_XDECREF(data_array);
+ return NULL;
+}
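read_rows fills a flat buffer row by row, growing the allocation in blocks when it runs out of space and shrinking it to the exact row count at the end (the PyDataMem_RENEW calls above). A standalone sketch of that grow-while-reading / shrink-at-the-end pattern using realloc, with scanf standing in for the tokenizer (all names illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        size_t capacity = 0, count = 0;
        double *data = NULL;

        double value;
        while (scanf("%lf", &value) == 1) {
            if (count == capacity) {
                /* grow by blocks; a realloc failure leaves `data` valid */
                size_t new_capacity = capacity ? capacity + capacity / 4 : 512;
                double *tmp = realloc(data, new_capacity * sizeof(*data));
                if (tmp == NULL) {
                    free(data);
                    return 1;
                }
                data = tmp;
                capacity = new_capacity;
            }
            data[count++] = value;
        }

        /* shrink the final allocation to exactly what was read */
        if (count != capacity) {
            double *tmp = realloc(data, (count ? count : 1) * sizeof(*data));
            if (tmp != NULL) {
                data = tmp;
            }
        }
        printf("read %zu values\n", count);
        free(data);
        return 0;
    }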
diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h
new file mode 100644
index 000000000..773e0f8e0
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/rows.h
@@ -0,0 +1,22 @@
+
+#ifndef _ROWS_H_
+#define _ROWS_H_
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <stdio.h>
+
+#include "textreading/stream.h"
+#include "textreading/field_types.h"
+#include "textreading/parser_config.h"
+
+
+PyArrayObject *
+read_rows(stream *s,
+ npy_intp nrows, int num_field_types, field_type *field_types,
+ parser_config *pconfig, int num_usecols, int *usecols,
+ Py_ssize_t skiplines, PyObject *converters,
+ PyArrayObject *data_array, PyArray_Descr *out_descr,
+ bool homogeneous);
+
+#endif
diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c
new file mode 100644
index 000000000..647e79a4f
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/str_to_int.c
@@ -0,0 +1,87 @@
+
+#include <Python.h>
+
+#include <string.h>
+#include "textreading/str_to_int.h"
+#include "textreading/conversions.h"
+#include "textreading/parser_config.h"
+
+
+NPY_NO_EXPORT PyArray_Descr *double_descr = NULL;
+
+// TODO: The float fallbacks are seriously awkward, why? Or at least why this way?
+#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \
+ int \
+ to_##intw(PyArray_Descr *descr, \
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \
+ parser_config *pconfig) \
+ { \
+ int64_t parsed; \
+ intw##_t x; \
+ \
+ if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \
+ if (pconfig->allow_float_for_int) { \
+ double fx; \
+ if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \
+ return -1; \
+ } \
+ else { \
+ x = (intw##_t) fx; \
+ } \
+ } \
+ else { \
+ return -1; \
+ } \
+ } \
+ else { \
+ x = (intw##_t)parsed; \
+ } \
+ memcpy(dataptr, &x, sizeof(x)); \
+ if (!PyArray_ISNBO(descr->byteorder)) { \
+ descr->f->copyswap(dataptr, dataptr, 1, NULL); \
+ } \
+ return 0; \
+ }
+
+#define DECLARE_TO_UINT(uintw, UINT_MAX) \
+ int \
+ to_##uintw(PyArray_Descr *descr, \
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \
+ parser_config *pconfig) \
+ { \
+ uint64_t parsed; \
+ uintw##_t x; \
+ \
+ if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \
+ if (pconfig->allow_float_for_int) { \
+ double fx; \
+ if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \
+ return -1; \
+ } \
+ else { \
+ x = (uintw##_t) fx; \
+ } \
+ } \
+ else { \
+ return -1; \
+ } \
+ } \
+ else { \
+ x = (uintw##_t)parsed; \
+ } \
+ memcpy(dataptr, &x, sizeof(x)); \
+ if (!PyArray_ISNBO(descr->byteorder)) { \
+ descr->f->copyswap(dataptr, dataptr, 1, NULL); \
+ } \
+ return 0; \
+ }
+
+DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX)
+DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX)
+DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX)
+DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX)
+
+DECLARE_TO_UINT(uint8, UINT8_MAX)
+DECLARE_TO_UINT(uint16, UINT16_MAX)
+DECLARE_TO_UINT(uint32, UINT32_MAX)
+DECLARE_TO_UINT(uint64, UINT64_MAX)
diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h
new file mode 100644
index 000000000..9cead56f0
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/str_to_int.h
@@ -0,0 +1,175 @@
+#ifndef STR_TO_INT_H
+#define STR_TO_INT_H
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/parser_config.h"
+
+extern NPY_NO_EXPORT PyArray_Descr *double_descr;
+
+/*
+ * The following two string conversion functions are largely equivalent
+ * to the ones in pandas. They are in the header file here so that they can
+ * be easily inlined into the other functions.
+ * Unlike pandas, they take an end pointer (do not rely on \0) and return 0 or -1.
+ *
+ * The actual functions are defined using macro templating below.
+ */
+static NPY_INLINE int
+str_to_int64(
+ const Py_UCS4 *p_item, const Py_UCS4 *p_end,
+ int64_t int_min, int64_t int_max, int64_t *result)
+{
+ const Py_UCS4 *p = (const Py_UCS4 *)p_item;
+ bool isneg = 0;
+ int64_t number = 0;
+
+ // Skip leading spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ if (*p == '-') {
+ isneg = true;
+ ++p;
+ }
+ else if (*p == '+') {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit(*p)) {
+ return -1;
+ }
+
+ if (isneg) {
+ // If number is greater than pre_min, at least one more digit
+ // can be processed without overflowing.
+ int dig_pre_min = -(int_min % 10);
+ int64_t pre_min = int_min / 10;
+
+ // Process the digits.
+ int d = *p;
+ while (isdigit(d)) {
+ if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) {
+ number = number * 10 - (d - '0');
+ d = *++p;
+ }
+ else {
+ return -1;
+ }
+ }
+ }
+ else {
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ int64_t pre_max = int_max / 10;
+ int dig_pre_max = int_max % 10;
+
+ // Process the digits.
+ int d = *p;
+ while (isdigit(d)) {
+ if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+ }
+ else {
+ return -1;
+ }
+ }
+ }
+
+ // Skip trailing spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Did we use up all the characters?
+ if (p != p_end) {
+ return -1;
+ }
+
+ *result = number;
+ return 0;
+}
+
+
+static NPY_INLINE int
+str_to_uint64(
+ const Py_UCS4 *p_item, const Py_UCS4 *p_end,
+ uint64_t uint_max, uint64_t *result)
+{
+ const Py_UCS4 *p = (const Py_UCS4 *)p_item;
+ uint64_t number = 0;
+ int d;
+
+ // Skip leading spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ if (*p == '-') {
+ return -1;
+ }
+ if (*p == '+') {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit(*p)) {
+ return -1;
+ }
+
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ uint64_t pre_max = uint_max / 10;
+ int dig_pre_max = uint_max % 10;
+
+ // Process the digits.
+ d = *p;
+ while (isdigit(d)) {
+ if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+ }
+ else {
+ return -1;
+ }
+ }
+
+ // Skip trailing spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Did we use up all the characters?
+ if (p != p_end) {
+ return -1;
+ }
+
+ *result = number;
+ return 0;
+}
+
+
+#define DECLARE_TO_INT_PROTOTYPE(intw) \
+ int \
+ to_##intw(PyArray_Descr *descr, \
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \
+ parser_config *pconfig);
+
+DECLARE_TO_INT_PROTOTYPE(int8)
+DECLARE_TO_INT_PROTOTYPE(int16)
+DECLARE_TO_INT_PROTOTYPE(int32)
+DECLARE_TO_INT_PROTOTYPE(int64)
+
+DECLARE_TO_INT_PROTOTYPE(uint8)
+DECLARE_TO_INT_PROTOTYPE(uint16)
+DECLARE_TO_INT_PROTOTYPE(uint32)
+DECLARE_TO_INT_PROTOTYPE(uint64)
+
+#endif
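str_to_int64/str_to_uint64 above avoid overflow by comparing against max/10 (plus the last permissible digit) before every multiply-and-add. A standalone sketch of that guard for unsigned 64-bit values (bounds, helper names, and the main() driver are illustrative only):

    #include <stdio.h>
    #include <stdint.h>
    #include <ctype.h>

    /* Overflow-safe decimal parse: accept another digit only if
     * number < max/10, or number == max/10 and the digit <= max%10. */
    static int
    parse_u64(const char *p, const char *end, uint64_t max, uint64_t *out)
    {
        uint64_t pre_max = max / 10;
        int dig_pre_max = (int)(max % 10);
        uint64_t number = 0;

        if (p == end || !isdigit((unsigned char)*p)) {
            return -1;
        }
        for (; p < end && isdigit((unsigned char)*p); p++) {
            int d = *p - '0';
            if (number < pre_max || (number == pre_max && d <= dig_pre_max)) {
                number = number * 10 + d;
            }
            else {
                return -1;   /* would overflow `max` */
            }
        }
        if (p != end) {
            return -1;       /* trailing junk */
        }
        *out = number;
        return 0;
    }

    int main(void)
    {
        uint64_t v;
        const char *s = "18446744073709551615";   /* UINT64_MAX */
        printf("%d\n", parse_u64(s, s + 20, UINT64_MAX, &v));   /* 0 (ok) */
        const char *t = "18446744073709551616";   /* UINT64_MAX + 1 */
        printf("%d\n", parse_u64(t, t + 20, UINT64_MAX, &v));   /* -1 */
        return 0;
    }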
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h
new file mode 100644
index 000000000..0c4567329
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/stream.h
@@ -0,0 +1,29 @@
+#ifndef _STREAM_H_
+#define _STREAM_H_
+
+#include <stdint.h>
+
+/*
+ * When getting the next line, we hope that the buffer provider can already
+ * give some information about the newlines, because for Python iterables
+ * we definitely expect to get line-by-line buffers.
+ */
+#define BUFFER_MAY_CONTAIN_NEWLINE 0
+#define BUFFER_IS_PARTIAL_LINE 1
+#define BUFFER_IS_LINEND 2
+#define BUFFER_IS_FILEEND 3
+
+typedef struct _stream {
+ void *stream_data;
+ int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind);
+ // Note that the first argument to stream_close is the stream pointer
+ // itself, not the stream_data pointer.
+ int (*stream_close)(struct _stream *strm);
+} stream;
+
+
+#define stream_nextbuf(s, start, end, kind) \
+ ((s)->stream_nextbuf((s)->stream_data, start, end, kind))
+#define stream_close(s) ((s)->stream_close((s)))
+
+#endif
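The stream struct above is a small vtable: stream_nextbuf hands back a [start, end) buffer plus its character width in *kind, and the return value carries one of the BUFFER_* flags (as in the Python-object implementations in the next file). A standalone sketch of an in-memory backend for this interface; the struct and macros are copied from the header above, everything else is illustrative:

    #include <stdio.h>
    #include <string.h>

    /* Constants and struct copied from stream.h above. */
    #define BUFFER_MAY_CONTAIN_NEWLINE 0
    #define BUFFER_IS_FILEEND 3

    typedef struct _stream {
        void *stream_data;
        int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind);
        int (*stream_close)(struct _stream *strm);
    } stream;

    #define stream_nextbuf(s, start, end, kind) \
            ((s)->stream_nextbuf((s)->stream_data, start, end, kind))
    #define stream_close(s) ((s)->stream_close((s)))

    /* Illustrative in-memory backend: serves the whole string once, then EOF. */
    typedef struct { const char *text; int served; } memdata;

    static int
    mem_nextbuf(void *sdata, char **start, char **end, int *kind)
    {
        memdata *md = sdata;
        *kind = 1;                       /* 1-byte characters in this sketch */
        if (md->served) {
            *start = *end = NULL;
            return BUFFER_IS_FILEEND;
        }
        md->served = 1;
        *start = (char *)md->text;
        *end = (char *)md->text + strlen(md->text);
        return BUFFER_MAY_CONTAIN_NEWLINE;
    }

    static int
    mem_close(stream *strm)
    {
        (void)strm;                      /* nothing heap-allocated here */
        return 0;
    }

    int main(void)
    {
        memdata md = {"1,2\n3,4\n", 0};
        stream s = {&md, mem_nextbuf, mem_close};

        char *start, *end;
        int kind, status;
        while ((status = stream_nextbuf(&s, &start, &end, &kind))
                    != BUFFER_IS_FILEEND) {
            printf("got %td bytes (kind=%d)\n", end - start, kind);
        }
        return stream_close(&s);
    }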
diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c
new file mode 100644
index 000000000..ccc902657
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c
@@ -0,0 +1,271 @@
+/*
+ * C side structures to provide capabilities to read Python file-like objects
+ * in chunks, or iterate through iterables with each result representing a
+ * single line of a file.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+
+#include "textreading/stream.h"
+
+#define READ_CHUNKSIZE 1 << 14
+
+
+typedef struct {
+ /* The Python file object being read. */
+ PyObject *file;
+
+ /* The `read` attribute of the file object. */
+ PyObject *read;
+ /* Amount to read each time we call `obj.read()` */
+ PyObject *chunksize;
+
+ /* file position when the file_buffer was created. */
+ off_t initial_file_pos;
+
+ /* Python str object holding the chunk most recently read from the file. */
+ PyObject *chunk;
+
+ /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */
+ const char *encoding;
+} python_chunks_from_file;
+
+
+/*
+ * Helper function to support byte objects as well as unicode strings.
+ *
+ * NOTE: Steals a reference to `str` (although usually returns it unmodified).
+ */
+static NPY_INLINE PyObject *
+process_stringlike(PyObject *str, const char *encoding)
+{
+ if (PyBytes_Check(str)) {
+ PyObject *ustr;
+ ustr = PyUnicode_FromEncodedObject(str, encoding, NULL);
+ if (ustr == NULL) {
+ return NULL;
+ }
+ Py_DECREF(str);
+ return ustr;
+ }
+ else if (!PyUnicode_Check(str)) {
+ PyErr_SetString(PyExc_TypeError,
+ "non-string returned while reading data");
+ Py_DECREF(str);
+ return NULL;
+ }
+ return str;
+}
+
+
+static NPY_INLINE void
+buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind)
+{
+ Py_ssize_t length = PyUnicode_GET_LENGTH(str);
+ *kind = PyUnicode_KIND(str);
+
+ if (*kind == PyUnicode_1BYTE_KIND) {
+ *start = (char *)PyUnicode_1BYTE_DATA(str);
+ }
+ else if (*kind == PyUnicode_2BYTE_KIND) {
+ *start = (char *)PyUnicode_2BYTE_DATA(str);
+ length *= sizeof(Py_UCS2);
+ }
+ else if (*kind == PyUnicode_4BYTE_KIND) {
+ *start = (char *)PyUnicode_4BYTE_DATA(str);
+ length *= sizeof(Py_UCS4);
+ }
+ *end = *start + length;
+}
+
+
+static int
+fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind)
+{
+ Py_XDECREF(fb->chunk);
+ fb->chunk = NULL;
+
+ PyObject *chunk = PyObject_CallFunctionObjArgs(fb->read, fb->chunksize, NULL);
+ if (chunk == NULL) {
+ return -1;
+ }
+ fb->chunk = process_stringlike(chunk, fb->encoding);
+ if (fb->chunk == NULL) {
+ return -1;
+ }
+ buffer_info_from_unicode(fb->chunk, start, end, kind);
+ if (*start == *end) {
+ return BUFFER_IS_FILEEND;
+ }
+ return BUFFER_MAY_CONTAIN_NEWLINE;
+}
+
+
+static int
+fb_del(stream *strm)
+{
+ python_chunks_from_file *fb = (python_chunks_from_file *)strm->stream_data;
+
+ Py_XDECREF(fb->file);
+ Py_XDECREF(fb->read);
+ Py_XDECREF(fb->chunksize);
+ Py_XDECREF(fb->chunk);
+
+ free(fb);
+ free(strm);
+
+ return 0;
+}
+
+
+stream *
+stream_python_file(PyObject *obj, const char *encoding)
+{
+ python_chunks_from_file *fb;
+ stream *strm;
+
+ fb = (python_chunks_from_file *) malloc(sizeof(python_chunks_from_file));
+ if (fb == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ fb->file = NULL;
+ fb->read = NULL;
+ fb->chunksize = NULL;
+ fb->chunk = NULL;
+ fb->encoding = encoding;
+
+ strm = (stream *) malloc(sizeof(stream));
+ if (strm == NULL) {
+ PyErr_NoMemory();
+ free(fb);
+ return NULL;
+ }
+
+ fb->file = obj;
+ Py_INCREF(fb->file);
+
+ fb->read = PyObject_GetAttrString(obj, "read");
+ if (fb->read == NULL) {
+ goto fail;
+ }
+ fb->chunksize = PyLong_FromLong(READ_CHUNKSIZE);
+ if (fb->chunksize == NULL) {
+ goto fail;
+ }
+
+ strm->stream_data = (void *)fb;
+ strm->stream_nextbuf = (void *)&fb_nextbuf;
+ strm->stream_close = &fb_del;
+
+ return strm;
+
+fail:
+ fb_del(strm);
+ return NULL;
+}
+
+
+/*
+ * Stream from a Python iterable by interpreting each item as a line in a file
+ */
+typedef struct {
+ /* The Python file object being read. */
+ PyObject *iterator;
+
+ /* Python str object holding the line most recently fetched */
+ PyObject *line;
+
+ /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */
+ const char *encoding;
+} python_lines_from_iterator;
+
+
+static int
+it_del(stream *strm)
+{
+ python_lines_from_iterator *it = (python_lines_from_iterator *)strm->stream_data;
+
+ Py_XDECREF(it->iterator);
+ Py_XDECREF(it->line);
+
+ free(it);
+ free(strm);
+
+ return 0;
+}
+
+
+static int
+it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind)
+{
+ Py_XDECREF(it->line);
+ it->line = NULL;
+
+ PyObject *line = PyIter_Next(it->iterator);
+ if (line == NULL) {
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ *start = NULL;
+ *end = NULL;
+ return BUFFER_IS_FILEEND;
+ }
+ it->line = process_stringlike(line, it->encoding);
+ if (it->line == NULL) {
+ return -1;
+ }
+
+ buffer_info_from_unicode(it->line, start, end, kind);
+ return BUFFER_IS_LINEND;
+}
+
+
+stream *
+stream_python_iterable(PyObject *obj, const char *encoding)
+{
+ python_lines_from_iterator *it;
+ stream *strm;
+
+ it = (python_lines_from_iterator *)malloc(sizeof(*it));
+ if (it == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ it->iterator = NULL;
+ it->line = NULL;
+ it->encoding = encoding;
+
+ strm = (stream *) malloc(sizeof(stream));
+ if (strm == NULL) {
+ PyErr_NoMemory();
+ free(it);
+ return NULL;
+ }
+ if (!PyIter_Check(obj)) {
+ PyErr_SetString(PyExc_TypeError,
+ "error reading from object, expected an iterable.");
+ goto fail;
+ }
+ Py_INCREF(obj);
+ it->iterator = obj;
+
+ strm->stream_data = (void *)it;
+ strm->stream_nextbuf = (void *)&it_nextbuf;
+ strm->stream_close = &it_del;
+
+ return strm;
+
+fail:
+ it_del(strm);
+ return NULL;
+}
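
The two adapters above differ mainly in what one "buffer" is: a chunk of up to READ_CHUNKSIZE characters from obj.read() versus one item from an iterator. A rough Python analogue of the iterator path (the helper name is made up; it only mirrors the decode-and-check behaviour of process_stringlike and it_nextbuf) might look like:

    def iter_decoded_lines(obj, encoding=None):
        """Sketch of it_nextbuf: one item per call, decoded to str."""
        for line in obj:
            if isinstance(line, bytes):
                # process_stringlike decodes bytes (UTF-8 when no encoding is given).
                line = line.decode(encoding or "utf-8")
            elif not isinstance(line, str):
                raise TypeError("non-string returned while reading data")
            yield line   # each item corresponds to BUFFER_IS_LINEND
        # exhausting the iterator corresponds to BUFFER_IS_FILEEND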
diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.h b/numpy/core/src/multiarray/textreading/stream_pyobject.h
new file mode 100644
index 000000000..93357e352
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/stream_pyobject.h
@@ -0,0 +1,16 @@
+
+#ifndef _STREAM_PYTHON_FILE_BY_LINE
+#define _STREAM_PYTHON_FILE_BY_LINE
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include "textreading/stream.h"
+
+stream *
+stream_python_file(PyObject *obj, const char *encoding);
+
+stream *
+stream_python_iterable(PyObject *obj, const char *encoding);
+
+#endif
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src
new file mode 100644
index 000000000..dcddb1b36
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/tokenize.c.src
@@ -0,0 +1,449 @@
+
+#include <Python.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/stream.h"
+#include "textreading/tokenize.h"
+#include "textreading/parser_config.h"
+#include "textreading/growth.h"
+
+
+/*
+ How parsing quoted fields works:
+
+ For quoting to be activated, the first character of the field
+ must be the quote character (after taking into account
+ ignore_leading_spaces). While quoting is active, delimiters
+ are treated as regular characters, not delimiters. Quoting is
+ deactivated by the second occurrence of the quote character. An
+ exception is the occurrence of two consecutive quote characters,
+ which is treated as a literal occurrence of a single quote character.
+ E.g. (with delimiter=',' and quote='"'):
+ 12.3,"New York, NY","3'2"""
+ The second and third fields are `New York, NY` and `3'2"`.
+
+ If a non-delimiter occurs after the closing quote, the quote is
+ ignored and parsing continues with quoting deactivated. Quotes
+ that occur while quoting is not activated are not handled specially;
+ they become part of the data.
+ E.g:
+ 12.3,"ABC"DEF,XY"Z
+ The second and third fields are `ABCDEF` and `XY"Z`.
+
+ Note that the second field of
+ 12.3,"ABC" ,4.5
+ is `ABC `. Currently there is no option to ignore whitespace
+ at the end of a field.
+*/
+
+
+/**begin repeat
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+copy_to_field_buffer_@type@(tokenizer_state *ts,
+ const @type@ *chunk_start, const @type@ *chunk_end)
+{
+ size_t chunk_length = chunk_end - chunk_start;
+ size_t size = chunk_length + ts->field_buffer_pos + 2;
+
+ if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
+ npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
+ if (alloc_size < 0) {
+ PyErr_Format(PyExc_ValueError,
+ "line too long to handle while reading file.");
+ return -1;
+ }
+ Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
+ if (grown == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->field_buffer_length = size;
+ ts->field_buffer = grown;
+ }
+
+ Py_UCS4 *write_pos = ts->field_buffer + ts->field_buffer_pos;
+ for (; chunk_start < chunk_end; chunk_start++, write_pos++) {
+ *write_pos = (Py_UCS4)*chunk_start;
+ }
+ *write_pos = '\0'; /* always ensure we end with NUL */
+ ts->field_buffer_pos += chunk_length;
+ return 0;
+}
+/**end repeat**/
+
+
+static NPY_INLINE int
+add_field(tokenizer_state *ts)
+{
+ /* The previous field is done, advance to keep a NUL byte at the end */
+ ts->field_buffer_pos += 1;
+
+ if (NPY_UNLIKELY((size_t)ts->num_fields + 1 > ts->fields_size)) {
+ size_t size = (size_t)ts->num_fields;
+
+ npy_intp alloc_size = grow_size_and_multiply(
+ &size, 4, sizeof(field_info));
+ if (alloc_size < 0) {
+ /* Check for a size overflow, path should be almost impossible. */
+ PyErr_Format(PyExc_ValueError,
+ "too many columns found; cannot read file.");
+ return -1;
+ }
+ field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
+ if (fields == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->fields = fields;
+ ts->fields_size = size;
+ }
+
+ ts->fields[ts->num_fields].offset = ts->field_buffer_pos;
+ ts->fields[ts->num_fields].quoted = false;
+ ts->num_fields += 1;
+ /* Ensure this (currently empty) word is NUL terminated. */
+ ts->field_buffer[ts->field_buffer_pos] = '\0';
+ return 0;
+}
+
+
+/**begin repeat
+ * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+{
+ @type@ *pos = (@type@ *)ts->pos;
+ @type@ *stop = (@type@ *)ts->end;
+ @type@ *chunk_start;
+
+ if (ts->state == TOKENIZE_CHECK_QUOTED) {
+ /* before we can check for quotes, strip leading whitespace */
+ if (config->ignore_leading_whitespace) {
+ while (pos < stop && Py_UNICODE_ISSPACE(*pos) &&
+ *pos != '\r' && *pos != '\n') {
+ pos++;
+ }
+ if (pos == stop) {
+ ts->pos = (char *)pos;
+ return 0;
+ }
+ }
+
+ /* Setting chunk effectively starts the field */
+ if (*pos == config->quote) {
+ ts->fields[ts->num_fields - 1].quoted = true;
+ ts->state = TOKENIZE_QUOTED;
+ pos++; /* TOKENIZE_QUOTED is OK with pos == stop */
+ }
+ else {
+            /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+ ts->state = ts->unquoted_state;
+ }
+ }
+
+ switch (ts->state) {
+ case TOKENIZE_UNQUOTED:
+ chunk_start = pos;
+ for (; pos < stop; pos++) {
+ if (*pos == '\r') {
+ ts->state = TOKENIZE_EAT_CRLF;
+ break;
+ }
+ else if (*pos == '\n') {
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ else if (*pos == config->delimiter) {
+ ts->state = TOKENIZE_INIT;
+ break;
+ }
+ else if (*pos == config->comment) {
+ ts->state = TOKENIZE_GOTO_LINE_END;
+ break;
+ }
+ }
+ if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ return -1;
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_UNQUOTED_WHITESPACE:
+ /* Note, this branch is largely identical to `TOKENIZE_UNQUOTED` */
+ chunk_start = pos;
+ for (; pos < stop; pos++) {
+ if (*pos == '\r') {
+ ts->state = TOKENIZE_EAT_CRLF;
+ break;
+ }
+ else if (*pos == '\n') {
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ else if (Py_UNICODE_ISSPACE(*pos)) {
+ ts->state = TOKENIZE_INIT;
+ break;
+ }
+ else if (*pos == config->comment) {
+ ts->state = TOKENIZE_GOTO_LINE_END;
+ break;
+ }
+ }
+ if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ return -1;
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_QUOTED:
+ chunk_start = pos;
+ for (; pos < stop; pos++) {
+                if (!config->allow_embedded_newline && *pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (!config->allow_embedded_newline && *pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+                else if (*pos != config->quote) {
+                    /* inside the field, nothing to do. */
+                }
+                else {
+                    /* closing quote (possibly the start of a doubled quote) */
+                    ts->state = TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE;
+                    break;
+                }
+ }
+ if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ return -1;
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
+ if (*pos == config->quote) {
+ ts->state = TOKENIZE_QUOTED;
+ pos++;
+ }
+ else {
+ /* continue parsing as if unquoted */
+ ts->state = TOKENIZE_UNQUOTED;
+ }
+ break;
+
+ case TOKENIZE_GOTO_LINE_END:
+ if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) {
+ pos = stop; /* advance to next buffer */
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ for (; pos < stop; pos++) {
+ if (*pos == '\r') {
+ ts->state = TOKENIZE_EAT_CRLF;
+ break;
+ }
+ else if (*pos == '\n') {
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_EAT_CRLF:
+ /* "Universal newline" support: remove \n in \r\n. */
+ if (*pos == '\n') {
+ pos++;
+ }
+ ts->state = TOKENIZE_LINE_END;
+ break;
+
+ default:
+ assert(0);
+ }
+
+ ts->pos = (char *)pos;
+ return 0;
+}
+/**end repeat**/
+
+
+/*
+ * This version now always copies the full "row" (all tokens). This makes
+ * two things easier:
+ * 1. It means that every word is guaranteed to be followed by a NUL character
+ * (although it can include one as well).
+ * 2. In the usecols case we can sniff the first row more easily by parsing it
+ * fully.
+ *
+ * The tokenizer could grow the ability to skip fields and check the
+ * maximum number of fields when known.
+ *
+ * Unlike other tokenizers, this one tries to work in chunks and copies
+ * data to words only when it has to. The hope is that this results in multiple
+ * light-weight loops rather than a single heavy one, to allow e.g. quickly
+ * scanning for the end of a field.
+ */
+int
+tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
+{
+ assert(ts->fields_size >= 2);
+ assert(ts->field_buffer_length >= 2*sizeof(Py_UCS4));
+
+ int finished_reading_file = 0;
+
+ /* Reset to start of buffer */
+ ts->field_buffer_pos = 0;
+ ts->num_fields = 0;
+    /* The first field is added in the loop below (state == TOKENIZE_INIT). */
+
+ while (1) {
+ if (ts->state == TOKENIZE_INIT) {
+ /* Start a new field */
+ if (add_field(ts) < 0) {
+ return -1;
+ }
+ ts->state = TOKENIZE_CHECK_QUOTED;
+ }
+
+ if (NPY_UNLIKELY(ts->pos >= ts->end)) {
+ if (ts->buf_state == BUFFER_IS_LINEND &&
+ ts->state != TOKENIZE_QUOTED &&
+ ts->state != TOKENIZE_CHECK_QUOTED) {
+ /*
+ * Finished line, do not read anymore (also do not eat \n).
+ * If we are in a quoted field and the "line" does not end with
+ * a newline, the quoted field will be missing it right now.
+ * TODO: We should probably just insert a "\n" character here,
+ * which is also closer to what the python code did
+ * (either by setting pos/end or manually).
+ */
+ goto finish;
+ }
+ /* fetch new data */
+ ts->buf_state = stream_nextbuf(s,
+ &ts->pos, &ts->end, &ts->unicode_kind);
+ if (ts->buf_state < 0) {
+ return -1;
+ }
+ if (ts->buf_state == BUFFER_IS_FILEEND) {
+ finished_reading_file = 1;
+ ts->pos = ts->end; /* should be guaranteed, but make sure. */
+ goto finish;
+ }
+ else if (ts->pos == ts->end) {
+ if (ts->buf_state != BUFFER_IS_LINEND) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Reader returned an empty buffer, "
+ "but did not indicate file or line end.");
+ return -1;
+ }
+ /* Otherwise, we are OK with this and assume an empty line. */
+ goto finish;
+ }
+ }
+ int status;
+ if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
+ status = tokenizer_core_Py_UCS1(ts, config);
+ }
+ else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
+ status = tokenizer_core_Py_UCS2(ts, config);
+ }
+ else {
+ assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
+ status = tokenizer_core_Py_UCS4(ts, config);
+ }
+ if (status < 0) {
+ return -1;
+ }
+
+ if (ts->state == TOKENIZE_LINE_END) {
+ goto finish;
+ }
+ }
+
+ finish:
+ /* Finish the last field */
+ if (add_field(ts) < 0) {
+ return -1;
+ }
+ ts->num_fields -= 1;
+ /*
+     * If we have only one field and that field is completely empty, this is an
+ * empty line, and we just ignore it.
+ */
+ if (ts->num_fields == 1
+ && ts->fields[1].offset - ts->fields[0].offset == 1
+ && !ts->fields->quoted) {
+ ts->num_fields--;
+ }
+ ts->state = TOKENIZE_INIT;
+ return finished_reading_file;
+}
+
+
+void
+tokenizer_clear(tokenizer_state *ts)
+{
+ PyMem_FREE(ts->field_buffer);
+ ts->field_buffer = NULL;
+ ts->field_buffer_length = 0;
+
+ PyMem_FREE(ts->fields);
+ ts->fields = NULL;
+ ts->fields_size = 0;
+}
+
+
+/*
+ * Initialize the tokenizer. We may want to copy all important config
+ * variables into the tokenizer. This would improve the cache locality during
+ * tokenizing.
+ */
+int
+tokenizer_init(tokenizer_state *ts, parser_config *config)
+{
+ /* State and buf_state could be moved into tokenize if we go by row */
+ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
+ ts->state = TOKENIZE_INIT;
+ if (config->delimiter_is_whitespace) {
+ ts->unquoted_state = TOKENIZE_UNQUOTED_WHITESPACE;
+ }
+ else {
+ ts->unquoted_state = TOKENIZE_UNQUOTED;
+ }
+ ts->num_fields = 0;
+
+ ts->buf_state = 0;
+ ts->pos = NULL;
+ ts->end = NULL;
+
+ ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4));
+ if (ts->field_buffer == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->field_buffer_length = 32;
+
+ ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields));
+ if (ts->fields == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->fields_size = 4;
+ return 0;
+}
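
The quoting rules documented at the top of this file can be checked against a small pure-Python reference. The function below is only an illustration of those rules (quotes activate at field start, doubled quotes are literal quotes, delimiters are plain data inside quotes); it is not how the C tokenizer is implemented.

    def split_quoted(line, delimiter=",", quote='"'):
        """Split one line following the quoting rules described above."""
        fields, field, quoting = [], [], False
        i = 0
        while i < len(line):
            c = line[i]
            if quoting:
                if c == quote:
                    if i + 1 < len(line) and line[i + 1] == quote:
                        field.append(quote)   # doubled quote -> literal quote
                        i += 1
                    else:
                        quoting = False       # closing quote
                else:
                    field.append(c)           # delimiters are plain data here
            else:
                if c == quote and not field:
                    quoting = True            # quote only activates at field start
                elif c == delimiter:
                    fields.append("".join(field))
                    field = []
                else:
                    field.append(c)
            i += 1
        fields.append("".join(field))
        return fields

    # Reproduces the examples from the comment at the top of this file:
    # split_quoted('12.3,"New York, NY","3\'2"""') gives
    #     ['12.3', 'New York, NY', '3\'2"']
    # split_quoted('12.3,"ABC"DEF,XY"Z') gives ['12.3', 'ABCDEF', 'XY"Z']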
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
new file mode 100644
index 000000000..aeac63107
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -0,0 +1,77 @@
+
+#ifndef _TOKENIZE_H_
+#define _TOKENIZE_H_
+
+#include <Python.h>
+#include "textreading/stream.h"
+#include "textreading/parser_config.h"
+
+
+typedef enum {
+ /* Initialization of fields */
+ TOKENIZE_INIT,
+ TOKENIZE_CHECK_QUOTED,
+ /* Main field parsing states */
+ TOKENIZE_UNQUOTED,
+ TOKENIZE_UNQUOTED_WHITESPACE,
+ TOKENIZE_QUOTED,
+ /* Handling of two character control sequences (except "\r\n") */
+ TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE,
+ /* Line end handling */
+ TOKENIZE_LINE_END,
+ TOKENIZE_EAT_CRLF, /* "\r\n" support (carriage return, line feed) */
+ TOKENIZE_GOTO_LINE_END,
+} tokenizer_parsing_state;
+
+
+
+typedef struct {
+ size_t offset;
+ bool quoted;
+} field_info;
+
+
+typedef struct {
+ tokenizer_parsing_state state;
+ /* Either TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE: */
+ tokenizer_parsing_state unquoted_state;
+ int unicode_kind;
+ int buf_state;
+ size_t num_fields;
+ /* the buffer we are currently working on */
+ char *pos;
+ char *end;
+ /*
+ * Space to copy words into. The buffer must always be at least two NUL
+ * entries longer (8 bytes) than the actual word (including initially).
+ * The first byte beyond the current word is always NUL'ed on write, the
+ * second byte is there to allow easy appending of an additional empty
+ * word at the end (this word is also NUL terminated).
+ */
+ size_t field_buffer_length;
+ size_t field_buffer_pos;
+ Py_UCS4 *field_buffer;
+
+ /*
+ * Fields, including information about the field being quoted. This
+ * always includes one "additional" empty field. The length of a field
+ * is equal to `fields[i+1].offset - fields[i].offset - 1`.
+ *
+ * The tokenizer assumes at least one field is allocated.
+ */
+ field_info *fields;
+ size_t fields_size;
+} tokenizer_state;
+
+
+void
+tokenizer_clear(tokenizer_state *ts);
+
+
+int
+tokenizer_init(tokenizer_state *ts, parser_config *config);
+
+int
+tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
+
+#endif
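
To make the `fields[i+1].offset - fields[i].offset - 1` convention concrete, here is a small illustration with made-up offsets (the real tokenizer's exact starting offsets may differ); it only demonstrates how a NUL-separated word buffer and the offsets relate.

    # Hypothetical layout after tokenizing the row "ab,c": each word is
    # NUL-terminated and one extra empty field marks the end of the row.
    buffer = list("ab\0c\0\0")
    offsets = [0, 3, 5]                # fields[i].offset, including the extra field

    for i in range(len(offsets) - 1):
        length = offsets[i + 1] - offsets[i] - 1
        word = "".join(buffer[offsets[i]:offsets[i] + length])
        print(i, repr(word))           # prints: 0 'ab' then 1 'c'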
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index a6c2d4c2d..c2472f601 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -5,6 +5,7 @@ import itertools
import warnings
import weakref
import contextlib
+import operator
from operator import itemgetter, index as opindex, methodcaller
from collections.abc import Mapping
@@ -13,6 +14,7 @@ from . import format
from ._datasource import DataSource
from numpy.core import overrides
from numpy.core.multiarray import packbits, unpackbits
+from numpy.core._multiarray_umath import _load_from_filelike
from numpy.core.overrides import set_array_function_like_doc, set_module
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
@@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
zipf.close()
-def _floatconv(x):
- try:
- return float(x) # The fastest path.
- except ValueError:
- if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10.
- try:
- return float.fromhex(x)
- except ValueError:
- pass
- raise # Raise the original exception, which makes more sense.
-
-
-_CONVERTERS = [ # These converters only ever get strs (not bytes) as input.
- (np.bool_, lambda x: bool(int(x))),
- (np.uint64, np.uint64),
- (np.int64, np.int64),
- (np.integer, lambda x: int(float(x))),
- (np.longdouble, np.longdouble),
- (np.floating, _floatconv),
- (complex, lambda x: complex(x.replace('+-', '-'))),
- (np.bytes_, methodcaller('encode', 'latin-1')),
- (np.unicode_, str),
-]
-
-
-def _getconv(dtype):
- """
- Find the correct dtype converter. Adapted from matplotlib.
-
- Even when a lambda is returned, it is defined at the toplevel, to allow
- testing for equality and enabling optimization for single-type data.
- """
- for base, conv in _CONVERTERS:
- if issubclass(dtype.type, base):
- return conv
- return str
-
-
-# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers
-# lifted to the toplevel because recursive inner functions cause either
-# GC-dependent reference loops (because they are closures over loadtxt's
-# internal variables) or large overheads if using a manual trampoline to hide
-# the recursive calls.
-
-
-# not to be confused with the flatten_dtype we import...
-def _loadtxt_flatten_dtype_internal(dt):
- """Unpack a structured data-type, and produce a packer function."""
- if dt.names is None:
- # If the dtype is flattened, return.
- # If the dtype has a shape, the dtype occurs
- # in the list more than once.
- shape = dt.shape
- if len(shape) == 0:
- return ([dt.base], None)
- else:
- packing = [(shape[-1], list)]
- if len(shape) > 1:
- for dim in dt.shape[-2::-1]:
- packing = [(dim*packing[0][0], packing*dim)]
- return ([dt.base] * int(np.prod(dt.shape)),
- functools.partial(_loadtxt_pack_items, packing))
- else:
- types = []
- packing = []
- for field in dt.names:
- tp, bytes = dt.fields[field]
- flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
- types.extend(flat_dt)
- flat_packing = flat_packer.args[0] if flat_packer else None
- # Avoid extra nesting for subarrays
- if tp.ndim > 0:
- packing.extend(flat_packing)
- else:
- packing.append((len(flat_dt), flat_packing))
- return (types, functools.partial(_loadtxt_pack_items, packing))
-
-
-def _loadtxt_pack_items(packing, items):
- """Pack items into nested lists based on re-packing info."""
- if packing is None:
- return items[0]
- elif packing is tuple:
- return tuple(items)
- elif packing is list:
- return list(items)
- else:
- start = 0
- ret = []
- for length, subpacking in packing:
- ret.append(
- _loadtxt_pack_items(subpacking, items[start:start+length]))
- start += length
- return tuple(ret)
-
def _ensure_ndmin_ndarray_check_param(ndmin):
"""Just checks if the param ndmin is supported on
_ensure_ndmin_ndarray. Is intented to be used as
@@ -859,6 +766,310 @@ def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
return (like,)
+def _check_nonneg_int(value, name="argument"):
+ try:
+ operator.index(value)
+ except TypeError:
+ raise TypeError(f"{name} must be an integer") from None
+ if value < 0:
+ raise ValueError(f"{name} must be nonnegative")
+
+
+def _preprocess_comments(iterable, comments, encoding):
+ """
+    Generator that consumes an iterable of lines and strips out multiple
+    (or multi-character) comments from each line.
+    This is a pre-processing step to achieve feature parity with loadtxt
+    (we assume that this is a niche feature).
+ """
+ for line in iterable:
+ if isinstance(line, bytes):
+ # Need to handle conversion here, or the splitting would fail
+ line = line.decode(encoding)
+
+ for c in comments:
+ line = line.split(c, 1)[0]
+
+ yield line
+
+
+# The number of rows we read in one go if confronted with a parametric dtype
+_loadtxt_chunksize = 50000
+
+
+def _read(fname, *, delimiter=',', comment='#', quote='"',
+ imaginary_unit='j', usecols=None, skiprows=0,
+ max_rows=None, converters=None, ndmin=None, unpack=False,
+ dtype=np.float64, encoding="bytes"):
+ r"""
+ Read a NumPy array from a text file.
+
+ Parameters
+ ----------
+ fname : str or file object
+ The filename or the file to be read.
+ delimiter : str, optional
+        Field delimiter separating the values within each line of the file.
+ Default is a comma, ','.
+ comment : str or sequence of str, optional
+ Character that begins a comment. All text from the comment
+ character to the end of the line is ignored.
+        Multiple comments or multi-character comment strings are supported,
+        but they may be slower, and `quote` must be the empty string if used.
+ quote : str, optional
+ Character that is used to quote string fields. Default is '"'
+ (a double quote).
+ imaginary_unit : str, optional
+        Character that represents the imaginary unit `sqrt(-1)`.
+ Default is 'j'.
+ usecols : array_like, optional
+ A one-dimensional array of integer column numbers. These are the
+ columns from the file to be included in the array. If this value
+ is not given, all the columns are used.
+ skiprows : int, optional
+ Number of lines to skip before interpreting the data in the file.
+ max_rows : int, optional
+ Maximum number of rows of data to read. Default is to read the
+ entire file.
+ converters : dict, optional
+ A dictionary mapping column number to a function that will parse the
+ column string into the desired value. E.g. if column 0 is a date
+ string: ``converters = {0: datestr2num}``. Converters can also be used
+ to provide a default value for missing data, e.g.
+ ``converters = {3: lambda s: float(s.strip() or 0)}``.
+ Default: None
+ ndmin : int, optional
+ Minimum dimension of the array returned.
+ Allowed values are 0, 1 or 2. Default is 0.
+ unpack : bool, optional
+ If True, the returned array is transposed, so that arguments may be
+ unpacked using ``x, y, z = read(...)``. When used with a structured
+ data-type, arrays are returned for each field. Default is False.
+ dtype : numpy data type
+        A NumPy dtype instance; it can be a structured dtype that maps to the
+        columns of the file.
+ encoding : str, optional
+        Encoding used to decode the input file. The special value 'bytes'
+ (the default) enables backwards-compatible behavior for `converters`,
+ ensuring that inputs to the converter functions are encoded
+ bytes objects. The special value 'bytes' has no additional effect if
+ ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
+ default system encoding is used.
+
+ Returns
+ -------
+ ndarray
+ NumPy array.
+
+ Examples
+ --------
+ First we create a file for the example.
+
+ >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
+ >>> with open('example1.csv', 'w') as f:
+ ... f.write(s1)
+    >>> a1 = _read('example1.csv')
+ >>> a1
+ array([[1., 2., 3.],
+ [4., 5., 6.]])
+
+ The second example has columns with different data types, so a
+ one-dimensional array with a structured data type is returned.
+ The tab character is used as the field delimiter.
+
+ >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
+ >>> with open('example2.tsv', 'w') as f:
+ ... f.write(s2)
+    >>> a2 = _read('example2.tsv', delimiter='\t')
+ >>> a2
+ array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
+ dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')])
+ """
+ # Handle special 'bytes' keyword for encoding
+ byte_converters = False
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+
+ if dtype is None:
+ raise TypeError("a dtype must be provided.")
+ dtype = np.dtype(dtype)
+
+ read_dtype_via_object_chunks = None
+ if dtype.kind in 'SUM' and (
+ dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
+ # This is a legacy "flexible" dtype. We do not truly support
+ # parametric dtypes currently (no dtype discovery step in the core),
+ # but have to support these for backward compatibility.
+ read_dtype_via_object_chunks = dtype
+ dtype = np.dtype(object)
+
+ if usecols is not None:
+ # Allow usecols to be a single int or a sequence of ints
+ try:
+ usecols_as_list = list(usecols)
+ except TypeError:
+ usecols_as_list = [usecols]
+ for col_idx in usecols_as_list:
+ try:
+ operator.index(col_idx)
+ except TypeError:
+ # Some unit tests for numpy.loadtxt require that the
+ # error message matches this format.
+ raise TypeError(
+ "usecols must be an int or a sequence of ints but "
+ "it contains at least one element of type %s" %
+ type(col_idx),
+ ) from None
+ # Fall back to existing code
+ usecols = np.array([operator.index(i) for i in usecols_as_list],
+ dtype=np.int32)
+
+ _ensure_ndmin_ndarray_check_param(ndmin)
+
+ if not isinstance(comment, str):
+ # assume comments are a sequence of strings
+ comments = tuple(comment)
+ comment = ''
+ # If there is only one comment, and that comment has one character,
+ # the normal parsing can deal with it just fine.
+ if len(comments) == 1:
+ if isinstance(comments[0], str) and len(comments[0]) == 1:
+ comment = comments[0]
+ comments = None
+ elif len(comment) > 1:
+ comments = (comment,)
+ comment = ''
+ else:
+ comments = None
+
+ # comment is now either a 1 or 0 character string or a tuple:
+ if comments is not None:
+ assert comment == ''
+        # Note: An earlier version supported two-character comments (and could
+        #       have been extended to multiple characters); we assume this is
+        #       rare enough not to optimize for it.
+ if quote != "":
+ raise ValueError(
+ "when multiple comments or a multi-character comment is given, "
+ "quotes are not supported. In this case the quote character "
+ "must be set to the empty string: `quote=''`.")
+ else:
+ # No preprocessing necessary
+ assert comments is None
+
+ if len(imaginary_unit) != 1:
+ raise ValueError('len(imaginary_unit) must be 1.')
+
+ _check_nonneg_int(skiprows)
+ if max_rows is not None:
+ _check_nonneg_int(max_rows)
+ else:
+ # Passing -1 to the C code means "read the entire file".
+ max_rows = -1
+
+ fh_closing_ctx = contextlib.nullcontext()
+ filelike = False
+ try:
+ if isinstance(fname, os.PathLike):
+ fname = os.fspath(fname)
+ # TODO: loadtxt actually uses `file + ''` to decide this?!
+ if isinstance(fname, str):
+ fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ if encoding is None:
+ encoding = getattr(fh, 'encoding', 'latin1')
+
+ fh_closing_ctx = contextlib.closing(fh)
+ data = fh
+ filelike = True
+ else:
+ if encoding is None:
+ encoding = getattr(fname, 'encoding', 'latin1')
+ data = iter(fname)
+ except TypeError as e:
+ raise ValueError(
+ f"fname must be a string, filehandle, list of strings,\n"
+ f"or generator. Got {type(fname)} instead.") from e
+
+ with fh_closing_ctx:
+ if comments is not None:
+ if filelike:
+ data = iter(data)
+ filelike = False
+ data = _preprocess_comments(data, comments, encoding)
+
+ if read_dtype_via_object_chunks is None:
+ arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+ usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters)
+
+ else:
+ # This branch reads the file into chunks of object arrays and then
+ # casts them to the desired actual dtype. This ensures correct
+ # string-length and datetime-unit discovery (as for `arr.astype()`).
+ # Due to chunking, certain error reports are less clear, currently.
+ if filelike:
+ data = iter(data) # cannot chunk when reading from file
+
+ c_byte_converters = False
+ if read_dtype_via_object_chunks == "S":
+ c_byte_converters = True # Use latin1 rather than ascii
+
+ chunks = []
+ while max_rows != 0:
+ if max_rows < 0:
+ chunk_size = _loadtxt_chunksize
+ else:
+ chunk_size = min(_loadtxt_chunksize, max_rows)
+
+ next_arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+ usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters,
+ c_byte_converters=c_byte_converters)
+ # Cast here already. We hope that this is better even for
+ # large files because the storage is more compact. It could
+ # be adapted (in principle the concatenate could cast).
+ chunks.append(next_arr.astype(read_dtype_via_object_chunks))
+
+ skiprows = 0 # Only have to skip for first chunk
+ if max_rows >= 0:
+ max_rows -= chunk_size
+ if len(next_arr) < chunk_size:
+ # There was less data than requested, so we are done.
+ break
+
+ # Need at least one chunk, but if empty, the last one may have
+ # the wrong shape.
+ if len(chunks) > 1 and len(chunks[-1]) == 0:
+ del chunks[-1]
+ if len(chunks) == 1:
+ arr = chunks[0]
+ else:
+ arr = np.concatenate(chunks, axis=0)
+
+ arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)
+
+ if unpack:
+ # Handle unpack like np.loadtxt.
+ # XXX Check interaction with ndmin!
+ dt = arr.dtype
+ if dt.names is not None:
+ # For structured arrays, return an array for each field.
+ return [arr[field] for field in dt.names]
+ else:
+ return arr.T
+ else:
+ return arr
+
+
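
As described above, multiple comment markers (or a multi-character marker) are routed through _preprocess_comments and require quoting to be disabled. A small usage sketch (not part of the patch) of how this surfaces through loadtxt, which passes quote='' to _read:

    import numpy as np
    from io import StringIO

    # Two different comment markers in one file; both are stripped by the
    # pre-processing generator before the C parser sees the lines.
    text = StringIO("# header\n1 2\n3 4  % trailing note\n")
    arr = np.loadtxt(text, comments=["#", "%"])
    print(arr.shape)   # (2, 2)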
@set_array_function_like_doc
@set_module('numpy')
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
@@ -1000,228 +1211,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
max_rows=max_rows, like=like
)
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Nested functions used by loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- def split_line(line: str):
- """Chop off comments, strip, and split at delimiter."""
- for comment in comments: # Much faster than using a single regex.
- line = line.split(comment, 1)[0]
- line = line.strip('\r\n')
- return line.split(delimiter) if line else []
+ if delimiter is None:
+ delimiter = ''
+ elif isinstance(delimiter, bytes):
+ delimiter.decode("latin1")
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Main body of loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- _ensure_ndmin_ndarray_check_param(ndmin)
+ if dtype is None:
+ dtype = np.float64
+ comment = comments
# Type conversions for Py3 convenience
- if comments is not None:
- if isinstance(comments, (str, bytes)):
- comments = [comments]
- comments = [_decode_line(x) for x in comments]
- else:
- comments = []
-
- if delimiter is not None:
- delimiter = _decode_line(delimiter)
-
- user_converters = converters
-
- byte_converters = False
- if encoding == 'bytes':
- encoding = None
- byte_converters = True
-
- if usecols is not None:
- # Copy usecols, allowing it to be a single int or a sequence of ints.
- try:
- usecols = list(usecols)
- except TypeError:
- usecols = [usecols]
- for i, col_idx in enumerate(usecols):
- try:
- usecols[i] = opindex(col_idx) # Cast to builtin int now.
- except TypeError as e:
- e.args = (
- "usecols must be an int or a sequence of ints but "
- "it contains at least one element of type %s" %
- type(col_idx),
- )
- raise
- if len(usecols) > 1:
- usecols_getter = itemgetter(*usecols)
- else:
- # Get an iterable back, even if using a single column.
- usecols_getter = lambda obj, c=usecols[0]: [obj[c]]
+ if comment is None:
+ comment = ''
else:
- usecols_getter = None
-
- # Make sure we're dealing with a proper dtype
- dtype = np.dtype(dtype)
- defconv = _getconv(dtype)
-
- dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype)
-
- fh_closing_ctx = contextlib.nullcontext()
- try:
- if isinstance(fname, os_PathLike):
- fname = os_fspath(fname)
- if _is_string_like(fname):
- fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
- fencoding = getattr(fh, 'encoding', 'latin1')
- line_iter = iter(fh)
- fh_closing_ctx = contextlib.closing(fh)
- else:
- line_iter = iter(fname)
- fencoding = getattr(fname, 'encoding', 'latin1')
- try:
- first_line = next(line_iter)
- except StopIteration:
- pass # Nothing matters if line_iter is empty.
- else:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- if isinstance(first_line, bytes):
- # Using latin1 matches _decode_line's behavior.
- decoder = methodcaller(
- "decode",
- encoding if encoding is not None else "latin1")
- line_iter = map(decoder, line_iter)
- except TypeError as e:
- raise ValueError(
- f"fname must be a string, filehandle, list of strings,\n"
- f"or generator. Got {type(fname)} instead."
- ) from e
-
- with fh_closing_ctx:
-
- # input may be a python2 io stream
- if encoding is not None:
- fencoding = encoding
- # we must assume local encoding
- # TODO emit portability warning?
- elif fencoding is None:
- import locale
- fencoding = locale.getpreferredencoding()
-
- # Skip the first `skiprows` lines
- for i in range(skiprows):
- next(line_iter)
-
- # Read until we find a line with some values, and use it to determine
- # the need for decoding and estimate the number of columns.
- for first_line in line_iter:
- ncols = len(usecols or split_line(first_line))
- if ncols:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- break
- else: # End of lines reached
- ncols = len(usecols or [])
- warnings.warn('loadtxt: Empty input file: "%s"' % fname,
- stacklevel=2)
-
- line_iter = itertools.islice(line_iter, max_rows)
- lineno_words_iter = filter(
- itemgetter(1), # item[1] is words; filter skips empty lines.
- enumerate(map(split_line, line_iter), 1 + skiprows))
-
- # Now that we know ncols, create the default converters list, and
- # set packing, if necessary.
- if len(dtype_types) > 1:
- # We're dealing with a structured array, each field of
- # the dtype matches a column
- converters = [_getconv(dt) for dt in dtype_types]
- else:
- # All fields have the same dtype; use specialized packers which are
- # much faster than those using _loadtxt_pack_items.
- converters = [defconv for i in range(ncols)]
- if ncols == 1:
- packer = itemgetter(0)
- else:
- def packer(row): return row
-
- # By preference, use the converters specified by the user
- for i, conv in (user_converters or {}).items():
- if usecols:
- try:
- i = usecols.index(i)
- except ValueError:
- # Unused converter specified
- continue
- if byte_converters:
- # converters may use decode to workaround numpy's old
- # behaviour, so encode the string again (converters are only
- # called with strings) before passing to the user converter.
- def tobytes_first(conv, x):
- return conv(x.encode("latin1"))
- converters[i] = functools.partial(tobytes_first, conv)
- else:
- converters[i] = conv
-
- fencode = methodcaller("encode", fencoding)
- converters = [conv if conv is not bytes else fencode
- for conv in converters]
- if len(set(converters)) == 1:
- # Optimize single-type data. Note that this is only reached if
- # `_getconv` returns equal callables (i.e. not local lambdas) on
- # equal dtypes.
- def convert_row(vals, _conv=converters[0]):
- return [*map(_conv, vals)]
- else:
- def convert_row(vals):
- return [conv(val) for conv, val in zip(converters, vals)]
-
- # read data in chunks and fill it into an array via resize
- # over-allocating and shrinking the array later may be faster but is
- # probably not relevant compared to the cost of actually reading and
- # converting the data
- X = None
- while True:
- chunk = []
- for lineno, words in itertools.islice(
- lineno_words_iter, _loadtxt_chunksize):
- if usecols_getter is not None:
- words = usecols_getter(words)
- elif len(words) != ncols:
- raise ValueError(
- f"Wrong number of columns at line {lineno}")
- # Convert each value according to its column, then pack it
- # according to the dtype's nesting, and store it.
- chunk.append(packer(convert_row(words)))
- if not chunk: # The islice is empty, i.e. we're done.
- break
-
- if X is None:
- X = np.array(chunk, dtype)
- else:
- nshape = list(X.shape)
- pos = nshape[0]
- nshape[0] += len(chunk)
- X.resize(nshape, refcheck=False)
- X[pos:, ...] = chunk
-
- if X is None:
- X = np.array([], dtype)
+ if isinstance(comment, (str, bytes)):
+ comment = [comment]
+ comment = [x.decode('latin1') if isinstance(x, bytes) else x for x in comment]
- # Multicolumn data are returned with shape (1, N, M), i.e.
- # (1, 1, M) for a single row - remove the singleton dimension there
- if X.ndim == 3 and X.shape[:2] == (1, 1):
- X.shape = (1, -1)
+ arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
+ converters=converters, skiprows=skiprows, usecols=usecols,
+ unpack=unpack, ndmin=ndmin, encoding=encoding,
+ max_rows=max_rows, quote='')
- X = _ensure_ndmin_ndarray(X, ndmin=ndmin)
-
- if unpack:
- if len(dtype_types) > 1:
- # For structured arrays, return an array for each field.
- return [X[field] for field in dtype.names]
- else:
- return X.T
- else:
- return X
+ return arr
_loadtxt_with_like = array_function_dispatch(