23 files changed, 2922 insertions, 311 deletions
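This patch adds a C-level text-file reader (tokenizer, per-dtype conversion functions, and Python stream wrappers) under numpy/core/src/multiarray/textreading/, and exposes it as the private module-level function _load_from_filelike. A minimal sketch of how the new entry point might be driven from Python, assuming the private numpy.core._multiarray_umath import path and the keyword names visible in the argument parser further below (none of this is public API):

import io
import numpy as np
from numpy.core import _multiarray_umath  # private module; exact import path is an assumption

buf = io.StringIO("1.0,2.0\n3.0,4.0\n")
# `dtype` must be a real dtype instance (the C side checks PyArray_DescrCheck);
# `filelike=True` selects the chunked `.read()` stream, False the line-iterator stream.
arr = _multiarray_umath._load_from_filelike(
    buf, dtype=np.dtype("float64"), filelike=True)
print(arr)  # expected: array([[1., 2.], [3., 4.]])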
diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 22cac1e9a..3d7e958d3 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -868,6 +868,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'typeinfo.h'), join('src', 'multiarray', 'usertypes.h'), join('src', 'multiarray', 'vdot.h'), + join('src', 'multiarray', 'textreading', 'readtext.h'), join('include', 'numpy', 'arrayobject.h'), join('include', 'numpy', '_neighborhood_iterator_imp.h'), join('include', 'numpy', 'npy_endian.h'), @@ -955,6 +956,14 @@ def configuration(parent_package='',top_path=None): join('src', 'npysort', 'selection.c.src'), join('src', 'common', 'npy_binsearch.h'), join('src', 'npysort', 'binsearch.cpp'), + join('src', 'multiarray', 'textreading', 'conversions.c'), + join('src', 'multiarray', 'textreading', 'field_types.c'), + join('src', 'multiarray', 'textreading', 'growth.c'), + join('src', 'multiarray', 'textreading', 'readtext.c'), + join('src', 'multiarray', 'textreading', 'rows.c'), + join('src', 'multiarray', 'textreading', 'stream_pyobject.c'), + join('src', 'multiarray', 'textreading', 'str_to_int.c'), + join('src', 'multiarray', 'textreading', 'tokenize.c.src'), ] ####################################################################### diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index a1de580d9..e4eb4f49e 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -993,6 +993,17 @@ PyArray_PyIntAsIntp(PyObject *o) } +NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val) +{ + *val = PyArray_PyIntAsIntp(o); + if (error_converting(*val)) { + return NPY_FAIL; + } + return NPY_SUCCEED; +} + + /* * PyArray_IntpFromIndexSequence * Returns the number of dimensions or -1 if an error occurred. 
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h index 4072841ee..4d0fbb894 100644 --- a/numpy/core/src/multiarray/conversion_utils.h +++ b/numpy/core/src/multiarray/conversion_utils.h @@ -7,6 +7,9 @@ NPY_NO_EXPORT int PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq); NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val); + +NPY_NO_EXPORT int PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq); typedef enum { diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 789446d0c..a7b6898e1 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -69,6 +69,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "get_attr_string.h" #include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ +#include "textreading/readtext.h" /* _readtext_from_file_object */ #include "npy_dlpack.h" @@ -4456,6 +4457,8 @@ static struct PyMethodDef array_module_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api, METH_O, NULL}, + {"_load_from_filelike", (PyCFunction)_load_from_filelike, + METH_FASTCALL | METH_KEYWORDS, NULL}, /* from umath */ {"frompyfunc", (PyCFunction) ufunc_frompyfunc, diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c new file mode 100644 index 000000000..be697c380 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -0,0 +1,375 @@ + +#include <Python.h> + +#include <string.h> +#include <stdlib.h> +#include <stdbool.h> + +#include "conversions.h" +#include "str_to_int.h" + +#include "array_coercion.h" + + +/* + * Coercion to boolean is done via integer right now. + */ +int +to_bool(PyArray_Descr *NPY_UNUSED(descr), + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + int64_t res; + if (str_to_int64(str, end, INT64_MIN, INT64_MAX, &res) < 0) { + return -1; + } + *dataptr = (char)(res != 0); + return 0; +} + + +/* + * In order to not pack a whole copy of a floating point parser, we copy the + * result into ascii and call the Python one. Float parsing isn't super quick + * so this is not terrible, but avoiding it would speed up things. + * + * Also note that parsing the first float of a complex will copy the whole + * string to ascii rather than just the first part. + * TODO: A tweak of the break might be a simple mitigation there. + * + * @param str The UCS4 string to parse + * @param end Pointer to the end of the string + * @param skip_trailing_whitespace If false does not skip trailing whitespace + * (used by the complex parser). + * @param result Output stored as double value. 
+ */ +static NPY_INLINE int +double_from_ucs4( + const Py_UCS4 *str, const Py_UCS4 *end, + bool skip_trailing_whitespace, double *result, const Py_UCS4 **p_end) +{ + /* skip leading whitespace */ + while (Py_UNICODE_ISSPACE(*str)) { + str++; + } + if (str == end) { + return -1; /* empty or only whitespace: not a floating point number */ + } + + /* We convert to ASCII for the Python parser, use stack if small: */ + char stack_buf[128]; + char *heap_buf = NULL; + char *ascii = stack_buf; + + /* +1 for the trailing NUL, so that str_len == 128 cannot overflow stack_buf: */ + size_t str_len = end - str + 1; + if (str_len > 128) { + heap_buf = PyMem_MALLOC(str_len); + if (heap_buf == NULL) { + PyErr_NoMemory(); + return -1; + } + ascii = heap_buf; + } + char *c = ascii; + for (; str < end; str++, c++) { + if (NPY_UNLIKELY(*str >= 128)) { + break; /* the following cannot be a number anymore */ + } + *c = (char)(*str); + } + *c = '\0'; + + char *end_parsed; + *result = PyOS_string_to_double(ascii, &end_parsed, NULL); + /* Rewind `end` to the first UCS4 character not parsed: */ + end = end - (c - end_parsed); + + PyMem_FREE(heap_buf); + + if (*result == -1. && PyErr_Occurred()) { + return -1; + } + + if (skip_trailing_whitespace) { + /* and then skip any remaining whitespace: */ + while (Py_UNICODE_ISSPACE(*end)) { + end++; + } + } + *p_end = end; + return 0; +} + +/* + * `item` must be the nul-terminated string that is to be + * converted to a double. + * + * To be successful, the conversion must use *all* the characters + * in `item`. E.g. "1.q25" will fail. Leading and trailing + * spaces are allowed. + */ +int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double double_val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &double_val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + float val = (float)double_val; + memcpy(dataptr, &val, sizeof(float)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + memcpy(dataptr, &val, sizeof(double)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +static bool +to_complex_int( + const Py_UCS4 *item, const Py_UCS4 *token_end, + double *p_real, double *p_imag, + Py_UCS4 imaginary_unit, bool allow_parens) +{ + const Py_UCS4 *p_end; + bool unmatched_opening_paren = false; + + /* Remove whitespace before the possibly leading '(' */ + while (Py_UNICODE_ISSPACE(*item)) { + ++item; + } + if (allow_parens && (*item == '(')) { + unmatched_opening_paren = true; + ++item; + } + if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) { + return false; + } + if (p_end == token_end) { + // No imaginary part in the string (e.g.
"3.5") + *p_imag = 0.0; + return !unmatched_opening_paren; + } + if (*p_end == imaginary_unit) { + // Pure imaginary part only (e.g "1.5j") + *p_imag = *p_real; + *p_real = 0.0; + ++p_end; + if (unmatched_opening_paren && (*p_end == ')')) { + ++p_end; + unmatched_opening_paren = false; + } + } + else if (unmatched_opening_paren && (*p_end == ')')) { + *p_imag = 0.0; + ++p_end; + unmatched_opening_paren = false; + } + else { + if (*p_end == '+') { + ++p_end; + } + if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) { + return false; + } + if (*p_end != imaginary_unit) { + return false; + } + ++p_end; + if (unmatched_opening_paren && (*p_end == ')')) { + ++p_end; + unmatched_opening_paren = false; + } + } + while (Py_UNICODE_ISSPACE(*p_end)) { + ++p_end; + } + return p_end == token_end; +} + + +int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, + pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex64 val = {(float)real, (float)imag}; + memcpy(dataptr, &val, sizeof(npy_complex64)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +int +to_cdouble(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex128 val = {real, imag}; + memcpy(dataptr, &val, sizeof(npy_complex128)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +/* + * String and unicode conversion functions. + */ +int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + const Py_UCS4* c = str; + size_t length = descr->elsize; + + for (size_t i = 0; i < length; i++) { + if (c < end) { + /* + * loadtxt assumed latin1, which is compatible with UCS1 (first + * 256 unicode characters). + */ + if (NPY_UNLIKELY(*c > 255)) { + /* TODO: Was UnicodeDecodeError, is unspecific error good? */ + return -1; + } + dataptr[i] = (Py_UCS1)(*c); + c++; + } + else { + dataptr[i] = '\0'; + } + } + return 0; +} + + +int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + size_t length = descr->elsize / 4; + + if (length <= (size_t)(end - str)) { + memcpy(dataptr, str, length * 4); + } + else { + size_t given_len = end - str; + memcpy(dataptr, str, given_len * 4); + memset(dataptr + given_len * 4, '\0', (length -given_len) * 4); + } + + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + + +/* + * Convert functions helper for the generic converter. 
+ */ +static PyObject * +call_converter_function( + PyObject *func, const Py_UCS4 *str, size_t length, bool byte_converters) +{ + PyObject *s = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str, length); + if (s == NULL) { + return s; + } + if (byte_converters) { + Py_SETREF(s, PyUnicode_AsEncodedString(s, "latin1", NULL)); + if (s == NULL) { + return NULL; + } + } + if (func == NULL) { + return s; + } + PyObject *result = PyObject_CallFunctionObjArgs(func, s, NULL); + Py_DECREF(s); + return result; +} + + +int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config, PyObject *func) +{ + bool use_byte_converter; + if (func == NULL) { + use_byte_converter = config->c_byte_converters; + } + else { + use_byte_converter = config->python_byte_converters; + } + /* Converts to unicode and calls custom converter (if set) */ + PyObject *converted = call_converter_function( + func, str, (size_t)(end - str), use_byte_converter); + if (converted == NULL) { + return -1; + } + + int res = PyArray_Pack(descr, dataptr, converted); + Py_DECREF(converted); + return res; +} + + +int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config) +{ + return to_generic_with_converter(descr, str, end, dataptr, config, NULL); +}
\ No newline at end of file diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h new file mode 100644 index 000000000..6308c10d4 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.h @@ -0,0 +1,57 @@ +#ifndef CONVERSIONS_H +#define CONVERSIONS_H + +#include <stdbool.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/parser_config.h" + +int +to_bool(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_cdouble(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused, PyObject *func); + +int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +#endif diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c new file mode 100644 index 000000000..914c8e4d8 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.c @@ -0,0 +1,200 @@ +#include "field_types.h" +#include "conversions.h" +#include "str_to_int.h" + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" +#include "alloc.h" + +#include "textreading/growth.h" + + +void +field_types_xclear(int num_field_types, field_type *ft) { + assert(num_field_types >= 0); + if (ft == NULL) { + return; + } + for (int i = 0; i < num_field_types; i++) { + Py_XDECREF(ft[i].descr); + ft[i].descr = NULL; + } + PyMem_Free(ft); +} + + +/* + * Fetch custom converters for the builtin NumPy DTypes (or the generic one). + * Structured DTypes get unpacked and `object` uses the generic method. + * + * TODO: This should probably be moved on the DType object in some form, + * to allow user DTypes to define their own converters. 
+ */ +static set_from_ucs4_function * +get_from_ucs4_function(PyArray_Descr *descr) +{ + if (descr->type_num == NPY_BOOL) { + return &to_bool; + } + else if (PyDataType_ISSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_int8; + case 2: + return &to_int16; + case 4: + return &to_int32; + case 8: + return &to_int64; + default: + assert(0); + } + } + else if (PyDataType_ISUNSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_uint8; + case 2: + return &to_uint16; + case 4: + return &to_uint32; + case 8: + return &to_uint64; + default: + assert(0); + } + } + else if (descr->type_num == NPY_FLOAT) { + return &to_float; + } + else if (descr->type_num == NPY_DOUBLE) { + return &to_double; + } + else if (descr->type_num == NPY_CFLOAT) { + return &to_cfloat; + } + else if (descr->type_num == NPY_CDOUBLE) { + return &to_cdouble; + } + else if (descr->type_num == NPY_STRING) { + return &to_string; + } + else if (descr->type_num == NPY_UNICODE) { + return &to_unicode; + } + return &to_generic; +} + + +/* + * Note that the function cleans up `ft` on error. If `num_field_types < 0` + * cleanup has already happened in the internal call. + */ +static npy_intp +field_type_grow_recursive(PyArray_Descr *descr, + npy_intp num_field_types, field_type **ft, npy_intp *ft_size, + npy_intp field_offset) +{ + if (PyDataType_HASSUBARRAY(descr)) { + PyArray_Dims shape = {NULL, -1}; + + if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) { + PyErr_SetString(PyExc_ValueError, "invalid subarray shape"); + field_types_xclear(num_field_types, *ft); + return -1; + } + npy_intp size = PyArray_MultiplyList(shape.ptr, shape.len); + npy_free_cache_dim_obj(shape); + for (npy_intp i = 0; i < size; i++) { + num_field_types = field_type_grow_recursive(descr->subarray->base, + num_field_types, ft, ft_size, field_offset); + field_offset += descr->subarray->base->elsize; + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + else if (PyDataType_HASFIELDS(descr)) { + npy_int num_descr_fields = PyTuple_Size(descr->names); + if (num_descr_fields < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + for (npy_intp i = 0; i < num_descr_fields; i++) { + PyObject *key = PyTuple_GET_ITEM(descr->names, i); + PyObject *tup = PyObject_GetItem(descr->fields, key); + if (tup == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + PyArray_Descr *field_descr; + PyObject *title; + int offset; + if (!PyArg_ParseTuple(tup, "Oi|O", &field_descr, &offset, &title)) { + Py_DECREF(tup); + field_types_xclear(num_field_types, *ft); + return -1; + } + num_field_types = field_type_grow_recursive( + field_descr, num_field_types, ft, ft_size, + field_offset + offset); + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + + if (*ft_size <= num_field_types) { + npy_intp alloc_size = grow_size_and_multiply( + ft_size, 4, sizeof(field_type)); + if (alloc_size < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + field_type *new_ft = PyMem_Realloc(*ft, alloc_size); + if (new_ft == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + *ft = new_ft; + } + + Py_INCREF(descr); + (*ft)[num_field_types].descr = descr; + (*ft)[num_field_types].set_from_ucs4 = get_from_ucs4_function(descr); + (*ft)[num_field_types].structured_offset = field_offset; + + return num_field_types + 1; +} + + +/* + * Prepare the "field_types" for the given dtypes/descriptors. 
Currently, + * we copy the itemsize, but the main thing is that we check for custom + * converters. + */ +npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft) +{ + if (descr->subarray != NULL) { + /* + * This could probably be allowed, but NumPy absorbs the dimensions + * so it is an awkward corner case that probably never really worked. + */ + PyErr_SetString(PyExc_TypeError, + "file reader does not support subarray dtypes. You can " + "put the dtype into a structured one using " + "`np.dtype(('name', dtype))` to avoid this limitation."); + return -1; + } + + npy_intp ft_size = 4; + *ft = PyMem_Malloc(ft_size * sizeof(field_type)); + if (*ft == NULL) { + return -1; + } + return field_type_grow_recursive(descr, 0, ft, &ft_size, 0); +} diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h new file mode 100644 index 000000000..5c4cfb2c6 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.h @@ -0,0 +1,49 @@ + +#ifndef _FIELD_TYPES_H_ +#define _FIELD_TYPES_H_ + +#include <stdint.h> +#include <stdbool.h> +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + +/* + * The original code had some error details, but I assume that we don't need + * it. Printing the string that failed to convert should be fine. + * This should potentially be public NumPy API, although exposing it is tricky. + * + * This function must support unaligned memory access. + * + * NOTE: An earlier version of the code had unused default versions (pandas + * does this) when columns are missing. We could define this either + * by passing `NULL` in, or by adding a default explicitly somewhere. + * (I think users should probably have to define the default, at which + * point it doesn't matter here.) + * + * NOTE: We are currently passing the parser config; this could be made public + * or could be set up to be dtype specific/private. Always passing + * pconfig fully seems easier right now even if it may change. + */ +typedef int (set_from_ucs4_function)( + PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, + char *dataptr, parser_config *pconfig); + +typedef struct _field_type { + set_from_ucs4_function *set_from_ucs4; + /* The original NumPy descriptor */ + PyArray_Descr *descr; + /* Offset to this entry within row. */ + npy_intp structured_offset; +} field_type; + + +void +field_types_xclear(int num_field_types, field_type *ft); + +npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft); + +#endif diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c new file mode 100644 index 000000000..a38c6d5aa --- /dev/null +++ b/numpy/core/src/multiarray/textreading/growth.c @@ -0,0 +1,38 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "templ_common.h" + +#include "textreading/growth.h" + + +/* + * Helper function taking the size input and growing it (based on min_grow). + * It further multiplies it with `itemsize` and ensures that all results fit + * into an `npy_intp`. + * Returns -1 if any overflow occurred or the result would not fit. + * The user has to ensure the input is size_t (i.e. unsigned).
+ */ +npy_intp +grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize) { + /* min_grow must be a power of two: */ + assert((min_grow & (min_grow - 1)) == 0); + size_t growth = *size >> 2; + if (growth <= min_grow) { + *size += min_grow; + } + else { + *size += growth + min_grow - 1; + *size &= ~min_grow; + + if (*size > NPY_MAX_INTP) { + return -1; + } + } + + npy_intp res; + if (npy_mul_with_overflow_intp(&res, (npy_intp)*size, itemsize)) { + return -1; + } + return res; +} + diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h new file mode 100644 index 000000000..debe9a7b3 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/growth.h @@ -0,0 +1,7 @@ +#ifndef _NPY_GROWTH_H +#define _NPY_GROWTH_H + +npy_intp +grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize); + +#endif /*_NPY_GROWTH_H */ diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h new file mode 100644 index 000000000..c60565de1 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/parser_config.h @@ -0,0 +1,77 @@ + +#ifndef _PARSER_CONFIG_H_ +#define _PARSER_CONFIG_H_ + +#include <stdbool.h> + +typedef struct { + /* + * Field delimiter character. + * Typically ',', ' ', '\t'; ignored if `delimiter_is_whitespace` is true. + */ + Py_UCS4 delimiter; + + /* + * Character used to quote fields. + * Typically '"' or "'". To disable quoting we set this to UINT_MAX + * (which is not a valid unicode character and thus cannot occur in the + * file; the same is used for all other characters if necessary). + */ + Py_UCS4 quote; + + /* + * Character(s) that indicate the start of a comment. + * Typically '#', '%' or ';'. + * When encountered in a line and not inside quotes, all characters + * from the comment character(s) to the end of the line are ignored. + */ + Py_UCS4 comment; + + /* + * Ignore whitespace at the beginning of a field (outside/before quotes). + * Is (and must be) set if `delimiter_is_whitespace`. + */ + bool ignore_leading_whitespace; + + /* + * If true, the delimiter is ignored and any unicode whitespace is used + * for splitting (same as `string.split()` in Python). In that case + * `ignore_leading_whitespace` should also be set. + */ + bool delimiter_is_whitespace; + + /* + * A boolean value (0 or 1). If 1, quoted fields may span + * more than one line. For example, the following + * 100, 200, "FOO + * BAR" + * is one "row", containing three fields: 100, 200 and "FOO\nBAR". + * If 0, the parser considers an unclosed quote to be an error. (XXX Check!) + */ + bool allow_embedded_newline; + + /* + * The imaginary unit character. Default is `j`. + */ + Py_UCS4 imaginary_unit; + + /* + * If true, when an integer dtype is given, the field is allowed + * to contain a floating point value. It will be cast to the + * integer type. + */ + bool allow_float_for_int; + /* + * Data should be encoded as `latin1` when using python converter + * (implementing `loadtxt` default Python 2 compatibility mode). + * The c byte converter is used when the user requested `dtype="S"`. + * In this case we go via `dtype=object`; however, loadtxt allows latin1 + * while normal object to string casts only accept ASCII, so it ensures + * that the object array already contains bytes and not strings.
+ */ + bool python_byte_converters; + bool c_byte_converters; +} parser_config; + + +#endif diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c new file mode 100644 index 000000000..750e77b2d --- /dev/null +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -0,0 +1,199 @@ +#include <stdio.h> +#include <stdbool.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" +#include "npy_argparse.h" +#include "conversion_utils.h" + +#include "textreading/parser_config.h" +#include "textreading/stream_pyobject.h" +#include "textreading/field_types.h" +#include "textreading/rows.h" +#include "textreading/str_to_int.h" + + +// +// `usecols` must point to a Python object that is Py_None or a 1-d contiguous +// numpy array with data type int32. +// +// `dtype` must point to a Python object that is Py_None or a numpy dtype +// instance. If the latter, code and sizes must be arrays of length +// num_dtype_fields, holding the flattened data field type codes and byte +// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype, +// but we do that in Python code.) +// +// If both `usecols` and `dtype` are not None, and the data type is compound, +// then len(usecols) must equal num_dtype_fields. +// +// If `dtype` is given and it is compound, and `usecols` is None, then the +// number of columns in the file must match the number of fields in `dtype`. +// +static PyObject * +_readtext_from_stream(stream *s, parser_config *pc, + PyObject *usecols, Py_ssize_t skiprows, Py_ssize_t max_rows, + PyObject *converters, PyObject *dtype) +{ + PyArrayObject *arr = NULL; + PyArray_Descr *out_dtype = NULL; + int32_t *cols; + int ncols; + field_type *ft = NULL; + + /* TODO: Find better solution maybe? 
*/ + if (double_descr == NULL) { + double_descr = PyArray_DescrFromType(NPY_DOUBLE); + } + + /* + * If dtypes[0] is dtype, the input was not structured and the result + * is considered "homogeneous", so we have to discover the number of + * columns. + */ + out_dtype = (PyArray_Descr *)dtype; + Py_INCREF(out_dtype); + + npy_intp num_fields = field_types_create(out_dtype, &ft); + if (num_fields < 0) { + goto finish; + } + bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; + + if (usecols == Py_None) { + ncols = num_fields; + cols = NULL; + } + else { + ncols = PyArray_SIZE((PyArrayObject *)usecols); + cols = PyArray_DATA((PyArrayObject *)usecols); + } + + arr = read_rows( + s, max_rows, num_fields, ft, pc, + ncols, cols, skiprows, converters, + NULL, out_dtype, homogeneous); + if (arr == NULL) { + goto finish; + } + + finish: + Py_XDECREF(out_dtype); + field_types_xclear(num_fields, ft); + return (PyObject *)arr; +} + + +static int +parse_control_character(PyObject *obj, Py_UCS4 *character) +{ + if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) > 1) { + PyErr_Format(PyExc_TypeError, + "Control character must be a single unicode character or " + "empty unicode string; but got: %.100R", obj); + return 0; + } + if (PyUnicode_GET_LENGTH(obj) == 0) { + *character = (Py_UCS4)-1; /* character beyond unicode range */ + return 1; + } + *character = PyUnicode_READ_CHAR(obj, 0); + return 1; +} + + +NPY_NO_EXPORT PyObject * +_load_from_filelike(PyObject *NPY_UNUSED(mod), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + PyObject *file; + Py_ssize_t skiprows = 0; + Py_ssize_t max_rows = -1; + PyObject *usecols = Py_None; + PyObject *converters = Py_None; + + PyObject *dtype = Py_None; + PyObject *encoding_obj = Py_None; + const char *encoding = NULL; + + parser_config pc = { + .delimiter = ',', + .comment = '#', + .quote = '"', + .imaginary_unit = 'j', + .allow_float_for_int = true, + .allow_embedded_newline = true, + .delimiter_is_whitespace = false, + .ignore_leading_whitespace = false, + .python_byte_converters = false, + .c_byte_converters = false, + }; + bool filelike = true; + + PyObject *arr = NULL; + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, + "file", NULL, &file, + "|delimiter", &parse_control_character, &pc.delimiter, + "|comment", &parse_control_character, &pc.comment, + "|quote", &parse_control_character, &pc.quote, + "|imaginary_unit", &parse_control_character, &pc.imaginary_unit, + "|usecols", NULL, &usecols, + "|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows, + "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows, + "|converters", NULL, &converters, + "|dtype", NULL, &dtype, + "|encoding", NULL, &encoding_obj, + "|filelike", &PyArray_BoolConverter, &filelike, + "|byte_converters", &PyArray_BoolConverter, &pc.python_byte_converters, + "|c_byte_converters", &PyArray_BoolConverter, &pc.c_byte_converters, + NULL, NULL, NULL) < 0) { + return NULL; + } + + if (pc.delimiter == (Py_UCS4)-1) { + pc.delimiter_is_whitespace = true; + /* Ignore leading whitespace to match `string.split(None)` */ + pc.ignore_leading_whitespace = true; + } + + if (!PyArray_DescrCheck(dtype)) { + PyErr_SetString(PyExc_TypeError, + "internal error: dtype must be provided and be a NumPy dtype"); + return NULL; + } + + if (encoding_obj != Py_None) { + if (!PyUnicode_Check(encoding_obj)) { + PyErr_SetString(PyExc_TypeError, + "encoding must be a unicode string."); + return NULL; + } + encoding =
PyUnicode_AsUTF8(encoding_obj); + if (encoding == NULL) { + return NULL; + } + } + + stream *s; + if (filelike) { + s = stream_python_file(file, encoding); + } + else { + s = stream_python_iterable(file, encoding); + } + if (s == NULL) { + PyErr_Format(PyExc_RuntimeError, "Unable to access the file."); + return NULL; + } + + arr = _readtext_from_stream(s, &pc, usecols, skiprows, max_rows, + converters, dtype); + stream_close(s); + return arr; +} + diff --git a/numpy/core/src/multiarray/textreading/readtext.h b/numpy/core/src/multiarray/textreading/readtext.h new file mode 100644 index 000000000..8c4707368 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/readtext.h @@ -0,0 +1,7 @@ +#ifndef READTEXT_H_ +#define READTEXT_H_ + +NPY_NO_EXPORT PyObject * +_load_from_filelike(PyObject *self, PyObject *args, PyObject *kwargs); + +#endif /* READTEXT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c new file mode 100644 index 000000000..9301abd5c --- /dev/null +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -0,0 +1,438 @@ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" +#include "numpy/npy_3kcompat.h" + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> +#include <math.h> +#include <stdbool.h> + +#include "textreading/stream.h" +#include "textreading/tokenize.h" +#include "textreading/conversions.h" +#include "textreading/field_types.h" +#include "textreading/rows.h" +#include "textreading/growth.h" + +/* + * Minimum size to grow the allocation by (or 25%). The 8KiB means the actual + * growth is within `8 KiB <= size < 16 KiB` (depending on the row size). + */ +#define MIN_BLOCK_SIZE (1 << 13) + + + +/* + * Create the array of converter functions from the Python converters. + */ +PyObject ** +create_conv_funcs( + PyObject *converters, int num_fields, int32_t *usecols) +{ + PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *)); + if (conv_funcs == NULL) { + PyErr_NoMemory(); + return NULL; + } + if (converters == Py_None) { + return conv_funcs; + } + else if (!PyDict_Check(converters)) { + PyErr_SetString(PyExc_TypeError, + "converters must be a dictionary mapping columns to converter " + "functions."); + goto error; + } + + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(converters, &pos, &key, &value)) { + Py_ssize_t column = PyNumber_AsSsize_t(key, PyExc_IndexError); + if (column == -1 && PyErr_Occurred()) { + PyErr_Format(PyExc_TypeError, + "keys of the converters dictionary must be integers; " + "got %.100R", key); + goto error; + } + if (usecols != NULL) { + /* + * This code searches for the corresponding usecol. It is + * identical to the legacy usecols code, which has two weaknesses: + * 1. It fails for duplicated usecols, only setting the converter + * for the first one. + * 2. It fails e.g. if usecols uses negative indexing and + * converters does not. (This is a feature, since it allows + * us to correctly normalize converters to result column here.)
+ */ + int i = 0; + for (; i < num_fields; i++) { + if (column == usecols[i]) { + column = i; + break; + } + } + if (i == num_fields) { + continue; /* ignore unused converter */ + } + } + else { + if (column < -num_fields || column >= num_fields) { + PyErr_Format(PyExc_ValueError, + "converter specified for column %zd, which is invalid " + "for the number of fields %d.", column, num_fields); + goto error; + } + if (column < 0) { + column += num_fields; + } + } + if (!PyCallable_Check(value)) { + PyErr_Format(PyExc_TypeError, + "values of the converters dictionary must be callable, " + "but the value associated with key %R is not", key); + goto error; + } + Py_INCREF(value); + conv_funcs[column] = value; + } + return conv_funcs; + + error: + for (int i = 0; i < num_fields; i++) { + Py_XDECREF(conv_funcs[i]); + } + PyMem_FREE(conv_funcs); + return NULL; +} + +/** + * Read a file into the provided array, or create (and possibly grow) an + * array to read into. + * + * @param s The stream object/struct providing reading capabilities used by + * the tokenizer. + * @param max_rows The number of rows to read, or -1. If negative + * all rows are read. + * @param num_field_types The number of field types stored in `field_types`. + * @param field_types Information about the dtype for each column (or one if + * `homogeneous`). + * @param pconfig Pointer to the parser config object used by both the + * tokenizer and the conversion functions. + * @param num_usecols The number of columns in `usecols`. + * @param usecols An array of length `num_usecols` or NULL. If given, it + * indicates which column is read for each individual row (negative + * columns are accepted). + * @param skiplines The number of lines to skip; these lines are ignored. + * @param converters Python dictionary of converters. Finalizing converters + * is difficult without information about the number of columns. + * @param data_array An array to be filled or NULL. In either case a new + * reference is returned (the reference to `data_array` is not stolen). + * @param out_descr The dtype used for allocating a new array. This is not + * used if `data_array` is provided. Note that the actual dtype of the + * returned array can differ for strings. + * @param homogeneous Whether the datatype of the array is homogeneous, + * i.e. not structured. In this case the number of columns has to be + * discovered and the returned array will be 2-dimensional rather than + * 1-dimensional. + * + * @returns Returns the result as an array object or NULL on error. The result + * is always a new reference (even when `data_array` was passed in). + */ +PyArrayObject * +read_rows(stream *s, + npy_intp max_rows, int num_field_types, field_type *field_types, + parser_config *pconfig, int num_usecols, int *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous) +{ + char *data_ptr = NULL; + int current_num_fields; + size_t row_size = out_descr->elsize; + PyObject **conv_funcs = NULL; + + bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT); + + int ndim = homogeneous ?
2 : 1; + npy_intp result_shape[2] = {0, 1}; + + bool data_array_allocated = data_array == NULL; + /* Make sure we own `data_array` for the purpose of error handling */ + Py_XINCREF(data_array); + size_t rows_per_block = 1; /* will be increased depending on row size */ + Py_ssize_t data_allocated_rows = 0; + + int ts_result = 0; + tokenizer_state ts; + if (tokenizer_init(&ts, pconfig) < 0) { + goto error; + } + + /* Set the actual number of fields if it is already known, otherwise -1 */ + int actual_num_fields = -1; + if (usecols != NULL) { + actual_num_fields = num_usecols; + } + else if (!homogeneous) { + actual_num_fields = num_field_types; + } + + for (; skiplines > 0; skiplines--) { + ts.state = TOKENIZE_GOTO_LINE_END; + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + else if (ts_result != 0) { + /* Fewer lines than skiplines is acceptable */ + break; + } + } + + Py_ssize_t row_count = 0; /* number of rows actually processed */ + while ((max_rows < 0 || row_count < max_rows) && ts_result == 0) { + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + current_num_fields = ts.num_fields; + field_info *fields = ts.fields; + if (ts.num_fields == 0) { + continue; /* Ignore empty line */ + } + + if (NPY_UNLIKELY(data_ptr == NULL)) { + // We've deferred some of the initialization tasks to here, + // because we've now read the first line, and we definitively + // know how many fields (i.e. columns) we will be processing. + if (actual_num_fields == -1) { + actual_num_fields = current_num_fields; + } + + conv_funcs = create_conv_funcs( + converters, actual_num_fields, usecols); + if (conv_funcs == NULL) { + goto error; + } + + /* Note that result_shape[1] is only used if homogeneous is true */ + result_shape[1] = actual_num_fields; + if (homogeneous) { + row_size *= actual_num_fields; + } + + if (data_array == NULL) { + if (max_rows < 0) { + /* + * Negative max_rows denotes to read the whole file, we + * approach this by allocating ever larger blocks. + * Adds a number of rows based on `MIN_BLOCK_SIZE`. + * Note: later code grows assuming this is a power of two. + */ + if (row_size == 0) { + /* actual rows_per_block should not matter here */ + rows_per_block = 512; + } + else { + /* safe on overflow since min_rows will be 0 or 1 */ + size_t min_rows = ( + (MIN_BLOCK_SIZE + row_size - 1) / row_size); + while (rows_per_block < min_rows) { + rows_per_block *= 2; + } + } + data_allocated_rows = rows_per_block; + } + else { + data_allocated_rows = max_rows; + } + result_shape[0] = data_allocated_rows; + Py_INCREF(out_descr); + /* + * We do not use Empty, as it would fill with None + * and requiring decref'ing if we shrink again. 
+ */ + data_array = (PyArrayObject *)PyArray_SimpleNewFromDescr( + ndim, result_shape, out_descr); + if (data_array == NULL) { + goto error; + } + if (needs_init) { + memset(PyArray_BYTES(data_array), 0, PyArray_NBYTES(data_array)); + } + } + else { + assert(max_rows >=0); + data_allocated_rows = max_rows; + } + data_ptr = PyArray_BYTES(data_array); + } + + if (!usecols && (actual_num_fields != current_num_fields)) { + PyErr_Format(PyExc_ValueError, + "the number of columns changed from %d to %d at row %zu; " + "use `usecols` to select a subset and avoid this error", + actual_num_fields, current_num_fields, row_count+1); + goto error; + } + + if (NPY_UNLIKELY(data_allocated_rows == row_count)) { + /* + * Grow by ~25% and rounded up to the next rows_per_block + * NOTE: This is based on very crude timings and could be refined! + */ + size_t new_rows = data_allocated_rows; + npy_intp alloc_size = grow_size_and_multiply( + &new_rows, rows_per_block, row_size); + if (alloc_size < 0) { + /* should normally error much earlier, but make sure */ + PyErr_SetString(PyExc_ValueError, + "array is too big. Cannot read file as a single array; " + "providing a maximum number of rows to read may help."); + goto error; + } + + char *new_data = PyDataMem_RENEW( + PyArray_BYTES(data_array), alloc_size ? alloc_size : 1); + if (new_data == NULL) { + PyErr_NoMemory(); + goto error; + } + /* Replace the arrays data since it may have changed */ + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = new_rows; + data_ptr = new_data + row_count * row_size; + data_allocated_rows = new_rows; + if (needs_init) { + memset(data_ptr, '\0', (new_rows - row_count) * row_size); + } + } + + for (int i = 0; i < actual_num_fields; ++i) { + int f; /* The field, either 0 (if homogeneous) or i. */ + int col; /* The column as read, remapped by usecols */ + char *item_ptr; + if (homogeneous) { + f = 0; + item_ptr = data_ptr + i * field_types[0].descr->elsize; + } + else { + f = i; + item_ptr = data_ptr + field_types[f].structured_offset; + } + + if (usecols == NULL) { + col = i; + } + else { + col = usecols[i]; + if (col < 0) { + // Python-like column indexing: k = -1 means the last column. 
+ col += current_num_fields; + } + if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) { + PyErr_Format(PyExc_ValueError, + "invalid column index %d at row %zu with %d " + "columns", + usecols[i], current_num_fields, row_count+1); + goto error; + } + } + + bool err = 0; + Py_UCS4 *str = ts.field_buffer + fields[col].offset; + Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1; + if (conv_funcs[i] == NULL) { + if (field_types[f].set_from_ucs4(field_types[f].descr, + str, end, item_ptr, pconfig) < 0) { + err = true; + } + } + else { + if (to_generic_with_converter(field_types[f].descr, + str, end, item_ptr, pconfig, conv_funcs[i]) < 0) { + err = true; + } + } + + if (NPY_UNLIKELY(err)) { + PyObject *exc, *val, *tb; + PyErr_Fetch(&exc, &val, &tb); + + size_t length = end - str; + PyObject *string = PyUnicode_FromKindAndData( + PyUnicode_4BYTE_KIND, str, length); + if (string == NULL) { + npy_PyErr_ChainExceptions(exc, val, tb); + goto error; + } + PyErr_Format(PyExc_ValueError, + "could not convert string %.100R to %S at " + "row %zu, column %d.", + string, field_types[f].descr, row_count, col+1); + Py_DECREF(string); + npy_PyErr_ChainExceptionsCause(exc, val, tb); + goto error; + } + } + + ++row_count; + data_ptr += row_size; + } + + tokenizer_clear(&ts); + PyMem_FREE(conv_funcs); + + if (data_array == NULL) { + assert(row_count == 0 && result_shape[0] == 0); + if (actual_num_fields == -1) { + /* + * We found no rows and have to discover the number of elements + * we have no choice but to guess 1. + * NOTE: It may make sense to move this outside of here to refine + * the behaviour where necessary. + */ + result_shape[1] = 1; + } + else { + result_shape[1] = actual_num_fields; + } + Py_INCREF(out_descr); + data_array = (PyArrayObject *)PyArray_Empty( + ndim, result_shape, out_descr, 0); + } + + /* + * Note that if there is no data, `data_array` may still be NULL and + * row_count is 0. In that case, always realloc just in case. + */ + if (data_array_allocated && data_allocated_rows != row_count) { + size_t size = row_count * row_size; + char *new_data = PyDataMem_RENEW( + PyArray_BYTES(data_array), size ? 
size : 1); + if (new_data == NULL) { + Py_DECREF(data_array); + PyErr_NoMemory(); + return NULL; + } + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = row_count; + } + + return data_array; + + error: + PyMem_FREE(conv_funcs); + tokenizer_clear(&ts); + Py_XDECREF(data_array); + return NULL; +} diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h new file mode 100644 index 000000000..773e0f8e0 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -0,0 +1,22 @@ + +#ifndef _ROWS_H_ +#define _ROWS_H_ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <stdio.h> + +#include "textreading/stream.h" +#include "textreading/field_types.h" +#include "textreading/parser_config.h" + + +PyArrayObject * +read_rows(stream *s, + npy_intp nrows, int num_field_types, field_type *field_types, + parser_config *pconfig, int num_usecols, int *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous); + +#endif diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c new file mode 100644 index 000000000..647e79a4f --- /dev/null +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -0,0 +1,87 @@ + +#include <Python.h> + +#include <string.h> +#include "textreading/str_to_int.h" +#include "textreading/conversions.h" +#include "textreading/parser_config.h" + + +NPY_NO_EXPORT PyArray_Descr *double_descr = NULL; + +// TODO: The float fallbacks are seriously awkward, why? Or at least why this way? +#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \ + int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + int64_t parsed; \ + intw##_t x; \ + \ + if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \ + if (pconfig->allow_float_for_int) { \ + double fx; \ + if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \ + return -1; \ + } \ + else { \ + x = (intw##_t) fx; \ + } \ + } \ + else { \ + return -1; \ + } \ + } \ + else { \ + x = (intw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + descr->f->copyswap(dataptr, dataptr, 1, NULL); \ + } \ + return 0; \ + } + +#define DECLARE_TO_UINT(uintw, UINT_MAX) \ + int \ + to_##uintw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + uint64_t parsed; \ + uintw##_t x; \ + \ + if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \ + if (pconfig->allow_float_for_int) { \ + double fx; \ + if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \ + return -1; \ + } \ + else { \ + x = (uintw##_t) fx; \ + } \ + } \ + else { \ + return -1; \ + } \ + } \ + else { \ + x = (uintw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + descr->f->copyswap(dataptr, dataptr, 1, NULL); \ + } \ + return 0; \ + } + +DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX) +DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX) +DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX) +DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX) + +DECLARE_TO_UINT(uint8, UINT8_MAX) +DECLARE_TO_UINT(uint16, UINT16_MAX) +DECLARE_TO_UINT(uint32, UINT32_MAX) +DECLARE_TO_UINT(uint64, UINT64_MAX) diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h new file 
mode 100644 index 000000000..9cead56f0 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/str_to_int.h @@ -0,0 +1,175 @@ +#ifndef STR_TO_INT_H +#define STR_TO_INT_H + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + +extern NPY_NO_EXPORT PyArray_Descr *double_descr; + +/* + * The following two string conversion functions are largely equivalent + * in Pandas. They are in the header file here, to ensure they can be easily + * inline in the other function. + * Unlike pandas, pass in end-pointer (do not rely on \0) and return 0 or -1. + * + * The actual functions are defined using macro templating below. + */ +static NPY_INLINE int +str_to_int64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + int64_t int_min, int64_t int_max, int64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + bool isneg = 0; + int64_t number = 0; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + isneg = true; + ++p; + } + else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + if (isneg) { + // If number is greater than pre_min, at least one more digit + // can be processed without overflowing. + int dig_pre_min = -(int_min % 10); + int64_t pre_min = int_min / 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + else { + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + int64_t pre_max = int_max / 10; + int dig_pre_max = int_max % 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? + if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +static NPY_INLINE int +str_to_uint64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + uint64_t uint_max, uint64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + uint64_t number = 0; + int d; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + return -1; + } + if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + uint64_t pre_max = uint_max / 10; + int dig_pre_max = uint_max % 10; + + // Process the digits. + d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? 
+ if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +#define DECLARE_TO_INT_PROTOTYPE(intw) \ + int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig); + +DECLARE_TO_INT_PROTOTYPE(int8) +DECLARE_TO_INT_PROTOTYPE(int16) +DECLARE_TO_INT_PROTOTYPE(int32) +DECLARE_TO_INT_PROTOTYPE(int64) + +DECLARE_TO_INT_PROTOTYPE(uint8) +DECLARE_TO_INT_PROTOTYPE(uint16) +DECLARE_TO_INT_PROTOTYPE(uint32) +DECLARE_TO_INT_PROTOTYPE(uint64) + +#endif diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h new file mode 100644 index 000000000..0c4567329 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -0,0 +1,29 @@ +#ifndef _STREAM_H_ +#define _STREAM_H_ + +#include <stdint.h> + +/* + * When getting the next line, we hope that the buffer provider can already + * give some information about the newlines, because for Python iterables + * we definitely expect to get line-by-line buffers. + */ +#define BUFFER_MAY_CONTAIN_NEWLINE 0 +#define BUFFER_IS_PARTIAL_LINE 1 +#define BUFFER_IS_LINEND 2 +#define BUFFER_IS_FILEEND 3 + +typedef struct _stream { + void *stream_data; + int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind); + // Note that the first argument to stream_close is the stream pointer + // itself, not the stream_data pointer. + int (*stream_close)(struct _stream *strm); +} stream; + + +#define stream_nextbuf(s, start, end, kind) \ + ((s)->stream_nextbuf((s)->stream_data, start, end, kind)) +#define stream_close(s) ((s)->stream_close((s))) + +#endif diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c new file mode 100644 index 000000000..ccc902657 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -0,0 +1,271 @@ +/* + * C side structures to provide capabilities to read Python file like objects + * in chunks, or iterate through iterables with each result representing a + * single line of a file. + */ + +#include <stdio.h> +#include <stdlib.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/stream.h" + +#define READ_CHUNKSIZE 1 << 14 + + +typedef struct { + /* The Python file object being read. */ + PyObject *file; + + /* The `read` attribute of the file object. */ + PyObject *read; + /* Amount to read each time we call `obj.read()` */ + PyObject *chunksize; + + /* file position when the file_buffer was created. */ + off_t initial_file_pos; + + /* Python str object holding the line most recently read from the file. */ + PyObject *chunk; + + /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ + const char *encoding; +} python_chunks_from_file; + + +/* + * Helper function to support byte objects as well as unicode strings. + * + * NOTE: Steals a reference to `str` (although usually returns it unmodified). 
+ */ +static NPY_INLINE PyObject * +process_stringlike(PyObject *str, const char *encoding) +{ + if (PyBytes_Check(str)) { + PyObject *ustr; + ustr = PyUnicode_FromEncodedObject(str, encoding, NULL); + if (ustr == NULL) { + return NULL; + } + Py_DECREF(str); + return ustr; + } + else if (!PyUnicode_Check(str)) { + PyErr_SetString(PyExc_TypeError, + "non-string returned while reading data"); + Py_DECREF(str); + return NULL; + } + return str; +} + + +static NPY_INLINE void +buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind) +{ + Py_ssize_t length = PyUnicode_GET_LENGTH(str); + *kind = PyUnicode_KIND(str); + + if (*kind == PyUnicode_1BYTE_KIND) { + *start = (char *)PyUnicode_1BYTE_DATA(str); + } + else if (*kind == PyUnicode_2BYTE_KIND) { + *start = (char *)PyUnicode_2BYTE_DATA(str); + length *= sizeof(Py_UCS2); + } + else if (*kind == PyUnicode_4BYTE_KIND) { + *start = (char *)PyUnicode_4BYTE_DATA(str); + length *= sizeof(Py_UCS4); + } + *end = *start + length; +} + + +static int +fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind) +{ + Py_XDECREF(fb->chunk); + fb->chunk = NULL; + + PyObject *chunk = PyObject_CallFunctionObjArgs(fb->read, fb->chunksize, NULL); + if (chunk == NULL) { + return -1; + } + fb->chunk = process_stringlike(chunk, fb->encoding); + if (fb->chunk == NULL) { + return -1; + } + buffer_info_from_unicode(fb->chunk, start, end, kind); + if (*start == *end) { + return BUFFER_IS_FILEEND; + } + return BUFFER_MAY_CONTAIN_NEWLINE; +} + + +static int +fb_del(stream *strm) +{ + python_chunks_from_file *fb = (python_chunks_from_file *)strm->stream_data; + + Py_XDECREF(fb->file); + Py_XDECREF(fb->read); + Py_XDECREF(fb->chunksize); + Py_XDECREF(fb->chunk); + + free(fb); + free(strm); + + return 0; +} + + +stream * +stream_python_file(PyObject *obj, const char *encoding) +{ + python_chunks_from_file *fb; + stream *strm; + + fb = (python_chunks_from_file *) malloc(sizeof(python_chunks_from_file)); + if (fb == NULL) { + PyErr_NoMemory(); + return NULL; + } + + fb->file = NULL; + fb->read = NULL; + fb->chunksize = NULL; + fb->chunk = NULL; + fb->encoding = encoding; + + strm = (stream *) malloc(sizeof(stream)); + if (strm == NULL) { + PyErr_NoMemory(); + free(fb); + return NULL; + } + + fb->file = obj; + Py_INCREF(fb->file); + + fb->read = PyObject_GetAttrString(obj, "read"); + if (fb->read == NULL) { + goto fail; + } + fb->chunksize = PyLong_FromLong(READ_CHUNKSIZE); + if (fb->chunksize == NULL) { + goto fail; + } + + strm->stream_data = (void *)fb; + strm->stream_nextbuf = (void *)&fb_nextbuf; + strm->stream_close = &fb_del; + + return strm; + +fail: + fb_del(strm); + return NULL; +} + + +/* + * Stream from a Python iterable by interpreting each item as a line in a file + */ +typedef struct { + /* The Python file object being read. 
*/ + PyObject *iterator; + + /* Python str object holding the line most recently fetched */ + PyObject *line; + + /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ + const char *encoding; +} python_lines_from_iterator; + + +static int +it_del(stream *strm) +{ + python_lines_from_iterator *it = (python_lines_from_iterator *)strm->stream_data; + + Py_XDECREF(it->iterator); + Py_XDECREF(it->line); + + free(it); + free(strm); + + return 0; +} + + +static int +it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind) +{ + Py_XDECREF(it->line); + it->line = NULL; + + PyObject *line = PyIter_Next(it->iterator); + if (line == NULL) { + if (PyErr_Occurred()) { + return -1; + } + *start = NULL; + *end = NULL; + return BUFFER_IS_FILEEND; + } + it->line = process_stringlike(line, it->encoding); + if (it->line == NULL) { + return -1; + } + + buffer_info_from_unicode(it->line, start, end, kind); + return BUFFER_IS_LINEND; +} + + +stream * +stream_python_iterable(PyObject *obj, const char *encoding) +{ + python_lines_from_iterator *it; + stream *strm; + + it = (python_lines_from_iterator *)malloc(sizeof(*it)); + if (it == NULL) { + PyErr_NoMemory(); + return NULL; + } + + it->iterator = NULL; + it->line = NULL; + it->encoding = encoding; + + strm = (stream *) malloc(sizeof(stream)); + if (strm == NULL) { + PyErr_NoMemory(); + free(it); + return NULL; + } + if (!PyIter_Check(obj)) { + PyErr_SetString(PyExc_TypeError, + "error reading from object, expected an iterable."); + goto fail; + } + Py_INCREF(obj); + it->iterator = obj; + + strm->stream_data = (void *)it; + strm->stream_nextbuf = (void *)&it_nextbuf; + strm->stream_close = &it_del; + + return strm; + +fail: + it_del(strm); + return NULL; +} diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.h b/numpy/core/src/multiarray/textreading/stream_pyobject.h new file mode 100644 index 000000000..93357e352 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.h @@ -0,0 +1,16 @@ + +#ifndef _STREAM_PYTHON_FILE_BY_LINE +#define _STREAM_PYTHON_FILE_BY_LINE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include "textreading/stream.h" + +stream * +stream_python_file(PyObject *obj, const char *encoding); + +stream * +stream_python_iterable(PyObject *obj, const char *encoding); + +#endif diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src new file mode 100644 index 000000000..dcddb1b36 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -0,0 +1,449 @@ + +#include <Python.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/stream.h" +#include "textreading/tokenize.h" +#include "textreading/parser_config.h" +#include "textreading/growth.h" + + +/* + How parsing quoted fields works: + + For quoting to be activated, the first character of the field + must be the quote character (after taking into account + ignore_leading_spaces). While quoting is active, delimiters + are treated as regular characters, not delimiters. Quoting is + deactivated by the second occurrence of the quote character. An + exception is the occurrence of two consecutive quote characters, + which is treated as a literal occurrence of a single quote character. + E.g. 
+
+
+/**begin repeat
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+copy_to_field_buffer_@type@(tokenizer_state *ts,
+        const @type@ *chunk_start, const @type@ *chunk_end)
+{
+    size_t chunk_length = chunk_end - chunk_start;
+    size_t size = chunk_length + ts->field_buffer_pos + 2;
+
+    if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
+        npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
+        if (alloc_size < 0) {
+            PyErr_Format(PyExc_ValueError,
+                    "line too long to handle while reading file.");
+            return -1;
+        }
+        Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
+        if (grown == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        ts->field_buffer_length = size;
+        ts->field_buffer = grown;
+    }
+
+    Py_UCS4 *write_pos = ts->field_buffer + ts->field_buffer_pos;
+    for (; chunk_start < chunk_end; chunk_start++, write_pos++) {
+        *write_pos = (Py_UCS4)*chunk_start;
+    }
+    *write_pos = '\0';  /* always ensure we end with NUL */
+    ts->field_buffer_pos += chunk_length;
+    return 0;
+}
+/**end repeat**/
+
+
+static NPY_INLINE int
+add_field(tokenizer_state *ts)
+{
+    /* The previous field is done, advance to keep a NUL byte at the end */
+    ts->field_buffer_pos += 1;
+
+    if (NPY_UNLIKELY((size_t)ts->num_fields + 1 > ts->fields_size)) {
+        size_t size = (size_t)ts->num_fields;
+
+        npy_intp alloc_size = grow_size_and_multiply(
+                &size, 4, sizeof(field_info));
+        if (alloc_size < 0) {
+            /* Size overflow; this path should be almost impossible to hit. */
+            PyErr_Format(PyExc_ValueError,
+                    "too many columns found; cannot read file.");
+            return -1;
+        }
+        field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
+        if (fields == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        ts->fields = fields;
+        ts->fields_size = size;
+    }
+
+    ts->fields[ts->num_fields].offset = ts->field_buffer_pos;
+    ts->fields[ts->num_fields].quoted = false;
+    ts->num_fields += 1;
+    /* Ensure this (currently empty) word is NUL terminated. */
+    ts->field_buffer[ts->field_buffer_pos] = '\0';
+    return 0;
+}
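Both growth paths above rely on ``grow_size_and_multiply`` from growth.c, which is not part of this hunk. A Python sketch of its assumed semantics (the real helper updates ``*size`` in place and returns the byte count, negative on overflow; the headroom policy here is an assumption):

    def grow_size_and_multiply(size, min_grow, itemsize):
        # Grow the required element count with headroom of at least
        # `min_grow` elements, then convert to bytes with overflow check.
        new_size = size + max(size // 4, min_grow)
        nbytes = new_size * itemsize
        if nbytes >= 2**63:      # would not fit into npy_intp
            return size, -1      # callers treat a negative result as error
        return new_size, nbytes

    # e.g. growing the 32-element field buffer for a 100-char word:
    # grow_size_and_multiply(103, 32, 4) -> (135, 540)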
+
+
+/**begin repeat
+ * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+{
+    @type@ *pos = (@type@ *)ts->pos;
+    @type@ *stop = (@type@ *)ts->end;
+    @type@ *chunk_start;
+
+    if (ts->state == TOKENIZE_CHECK_QUOTED) {
+        /* before we can check for quotes, strip leading whitespace */
+        if (config->ignore_leading_whitespace) {
+            while (pos < stop && Py_UNICODE_ISSPACE(*pos) &&
+                        *pos != '\r' && *pos != '\n') {
+                pos++;
+            }
+            if (pos == stop) {
+                ts->pos = (char *)pos;
+                return 0;
+            }
+        }
+
+        /* Setting chunk effectively starts the field */
+        if (*pos == config->quote) {
+            ts->fields[ts->num_fields - 1].quoted = true;
+            ts->state = TOKENIZE_QUOTED;
+            pos++;  /* TOKENIZE_QUOTED is OK with pos == stop */
+        }
+        else {
+            /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+            ts->state = ts->unquoted_state;
+        }
+    }
+
+    switch (ts->state) {
+        case TOKENIZE_UNQUOTED:
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+                else if (*pos == config->delimiter) {
+                    ts->state = TOKENIZE_INIT;
+                    break;
+                }
+                else if (*pos == config->comment) {
+                    ts->state = TOKENIZE_GOTO_LINE_END;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_UNQUOTED_WHITESPACE:
+            /* Note, this branch is largely identical to `TOKENIZE_UNQUOTED` */
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+                else if (Py_UNICODE_ISSPACE(*pos)) {
+                    ts->state = TOKENIZE_INIT;
+                    break;
+                }
+                else if (*pos == config->comment) {
+                    ts->state = TOKENIZE_GOTO_LINE_END;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_QUOTED:
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (!config->allow_embedded_newline) {
+                    if (*pos == '\r') {
+                        ts->state = TOKENIZE_EAT_CRLF;
+                        break;
+                    }
+                    else if (*pos == '\n') {
+                        ts->state = TOKENIZE_LINE_END;
+                        break;
+                    }
+                }
+                /*
+                 * The closing quote has to end the field whether or not
+                 * embedded newlines are allowed (previously the quote check
+                 * was skipped entirely when they were disallowed).
+                 */
+                if (*pos == config->quote) {
+                    ts->state = TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE;
+                    break;
+                }
+                /* otherwise, still inside the field: nothing to do. */
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
+            if (*pos == config->quote) {
+                ts->state = TOKENIZE_QUOTED;
+                pos++;
+            }
+            else {
+                /* continue parsing as if unquoted */
+                ts->state = TOKENIZE_UNQUOTED;
+            }
+            break;
+
+        case TOKENIZE_GOTO_LINE_END:
+            if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) {
+                pos = stop;  /* advance to next buffer */
+                ts->state = TOKENIZE_LINE_END;
+                break;
+            }
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_EAT_CRLF:
+            /* "Universal newline" support: remove \n in \r\n. */
+            if (*pos == '\n') {
+                pos++;
+            }
+            ts->state = TOKENIZE_LINE_END;
+            break;
+
+        default:
+            assert(0);
+    }
+
+    ts->pos = (char *)pos;
+    return 0;
+}
+/**end repeat**/
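The per-field state flow above compresses to a few lines of plain Python. This simplified model (an illustration, not part of the patch) handles only delimiters and quotes — no newlines, whitespace modes, or comments — but reproduces the examples from the comment at the top of the file:

    def split_fields(line, delim=',', quote='"'):
        fields, field, quoted = [], [], False
        i = 0
        while i < len(line):
            c = line[i]
            if quoted:
                if c == quote and line[i + 1:i + 2] == quote:
                    field.append(quote)       # doubled quote -> literal quote
                    i += 1
                elif c == quote:
                    quoted = False            # closing quote ends quoting
                else:
                    field.append(c)           # delimiters are plain data here
            elif c == quote and not field:
                quoted = True                 # quote only opens at field start
            elif c == delim:
                fields.append(''.join(field))
                field = []
            else:
                field.append(c)
            i += 1
        fields.append(''.join(field))
        return fields

    print(split_fields('12.3,"New York, NY","3\'2"""'))
    # -> ['12.3', 'New York, NY', '3\'2"']
    print(split_fields('12.3,"ABC"DEF,XY"Z'))
    # -> ['12.3', 'ABCDEF', 'XY"Z']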
+
+
+/*
+ * This version now always copies the full "row" (all tokens).  This makes
+ * two things easier:
+ * 1. It means that every word is guaranteed to be followed by a NUL character
+ *    (although it can include one as well).
+ * 2. In the usecols case we can sniff the first row more easily by parsing
+ *    it fully.
+ *
+ * The tokenizer could grow the ability to skip fields and check the
+ * maximum number of fields when known.
+ *
+ * Unlike other tokenizers, this one tries to work in chunks and copies
+ * data to words only when it has to.  The hope is that this makes multiple
+ * light-weight loops rather than a single heavy one, to allow e.g. quickly
+ * scanning for the end of a field.
+ */
+int
+tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
+{
+    assert(ts->fields_size >= 2);
+    assert(ts->field_buffer_length >= 2*sizeof(Py_UCS4));
+
+    int finished_reading_file = 0;
+
+    /* Reset to start of buffer */
+    ts->field_buffer_pos = 0;
+    ts->num_fields = 0;
+    /* (the first field is added below, as soon as the state is TOKENIZE_INIT) */
+
+    while (1) {
+        if (ts->state == TOKENIZE_INIT) {
+            /* Start a new field */
+            if (add_field(ts) < 0) {
+                return -1;
+            }
+            ts->state = TOKENIZE_CHECK_QUOTED;
+        }
+
+        if (NPY_UNLIKELY(ts->pos >= ts->end)) {
+            if (ts->buf_state == BUFFER_IS_LINEND &&
+                    ts->state != TOKENIZE_QUOTED &&
+                    ts->state != TOKENIZE_CHECK_QUOTED) {
+                /*
+                 * Finished line, do not read anymore (also do not eat \n).
+                 * If we are in a quoted field and the "line" does not end with
+                 * a newline, the quoted field will be missing it right now.
+                 * TODO: We should probably just insert a "\n" character here,
+                 *       which is also closer to what the python code did
+                 *       (either by setting pos/end or manually).
+                 */
+                goto finish;
+            }
+            /* fetch new data */
+            ts->buf_state = stream_nextbuf(s,
+                    &ts->pos, &ts->end, &ts->unicode_kind);
+            if (ts->buf_state < 0) {
+                return -1;
+            }
+            if (ts->buf_state == BUFFER_IS_FILEEND) {
+                finished_reading_file = 1;
+                ts->pos = ts->end;  /* should be guaranteed, but make sure. */
+                goto finish;
+            }
+            else if (ts->pos == ts->end) {
+                if (ts->buf_state != BUFFER_IS_LINEND) {
+                    PyErr_SetString(PyExc_RuntimeError,
+                            "Reader returned an empty buffer, "
+                            "but did not indicate file or line end.");
+                    return -1;
+                }
+                /* Otherwise, we are OK with this and assume an empty line. */
+                goto finish;
+            }
+        }
+        int status;
+        if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
+            status = tokenizer_core_Py_UCS1(ts, config);
+        }
+        else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
+            status = tokenizer_core_Py_UCS2(ts, config);
+        }
+        else {
+            assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
+            status = tokenizer_core_Py_UCS4(ts, config);
+        }
+        if (status < 0) {
+            return -1;
+        }
+
+        if (ts->state == TOKENIZE_LINE_END) {
+            goto finish;
+        }
+    }
+
+  finish:
+    /* Finish the last field (this appends the trailing sentinel field) */
+    if (add_field(ts) < 0) {
+        return -1;
+    }
+    ts->num_fields -= 1;  /* discount the sentinel again */
+    /*
+     * If we have just one field and it is completely empty, this is an
+     * empty line, and we just ignore it.
+     */
+    if (ts->num_fields == 1
+            && ts->fields[1].offset - ts->fields[0].offset == 1
+            && !ts->fields->quoted) {
+        ts->num_fields--;
+    }
+    ts->state = TOKENIZE_INIT;
+    return finished_reading_file;
+}
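Each call to ``tokenize()`` therefore yields one row of fields and returns 1 once the file is exhausted (-1 on error). A hypothetical Python rendition of the driver loop — rows.c, which actually consumes this, is not part of this hunk, and ``ts.word(i)`` is an invented accessor:

    def read_all_rows(stream, ts, config):
        rows = []
        while True:
            finished = tokenize(stream, ts, config)  # 0, or 1 at end of file
            if finished < 0:
                raise RuntimeError("tokenizer failed")
            if ts.num_fields > 0:                    # empty lines are skipped
                rows.append([ts.word(i) for i in range(ts.num_fields)])
            if finished:
                return rows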
+ */ + if (ts->num_fields == 1 + && ts->fields[1].offset - ts->fields[0].offset == 1 + && !ts->fields->quoted) { + ts->num_fields--; + } + ts->state = TOKENIZE_INIT; + return finished_reading_file; +} + + +void +tokenizer_clear(tokenizer_state *ts) +{ + PyMem_FREE(ts->field_buffer); + ts->field_buffer = NULL; + ts->field_buffer_length = 0; + + PyMem_FREE(ts->fields); + ts->fields = NULL; + ts->fields_size = 0; +} + + +/* + * Initialize the tokenizer. We may want to copy all important config + * variables into the tokenizer. This would improve the cache locality during + * tokenizing. + */ +int +tokenizer_init(tokenizer_state *ts, parser_config *config) +{ + /* State and buf_state could be moved into tokenize if we go by row */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; + ts->state = TOKENIZE_INIT; + if (config->delimiter_is_whitespace) { + ts->unquoted_state = TOKENIZE_UNQUOTED_WHITESPACE; + } + else { + ts->unquoted_state = TOKENIZE_UNQUOTED; + } + ts->num_fields = 0; + + ts->buf_state = 0; + ts->pos = NULL; + ts->end = NULL; + + ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4)); + if (ts->field_buffer == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->field_buffer_length = 32; + + ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields)); + if (ts->fields == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->fields_size = 4; + return 0; +} diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h new file mode 100644 index 000000000..aeac63107 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/tokenize.h @@ -0,0 +1,77 @@ + +#ifndef _TOKENIZE_H_ +#define _TOKENIZE_H_ + +#include <Python.h> +#include "textreading/stream.h" +#include "textreading/parser_config.h" + + +typedef enum { + /* Initialization of fields */ + TOKENIZE_INIT, + TOKENIZE_CHECK_QUOTED, + /* Main field parsing states */ + TOKENIZE_UNQUOTED, + TOKENIZE_UNQUOTED_WHITESPACE, + TOKENIZE_QUOTED, + /* Handling of two character control sequences (except "\r\n") */ + TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE, + /* Line end handling */ + TOKENIZE_LINE_END, + TOKENIZE_EAT_CRLF, /* "\r\n" support (carriage return, line feed) */ + TOKENIZE_GOTO_LINE_END, +} tokenizer_parsing_state; + + + +typedef struct { + size_t offset; + bool quoted; +} field_info; + + +typedef struct { + tokenizer_parsing_state state; + /* Either TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE: */ + tokenizer_parsing_state unquoted_state; + int unicode_kind; + int buf_state; + size_t num_fields; + /* the buffer we are currently working on */ + char *pos; + char *end; + /* + * Space to copy words into. The buffer must always be at least two NUL + * entries longer (8 bytes) than the actual word (including initially). + * The first byte beyond the current word is always NUL'ed on write, the + * second byte is there to allow easy appending of an additional empty + * word at the end (this word is also NUL terminated). + */ + size_t field_buffer_length; + size_t field_buffer_pos; + Py_UCS4 *field_buffer; + + /* + * Fields, including information about the field being quoted. This + * always includes one "additional" empty field. The length of a field + * is equal to `fields[i+1].offset - fields[i].offset - 1`. + * + * The tokenizer assumes at least one field is allocated. 
+ */ + field_info *fields; + size_t fields_size; +} tokenizer_state; + + +void +tokenizer_clear(tokenizer_state *ts); + + +int +tokenizer_init(tokenizer_state *ts, parser_config *config); + +int +tokenize(stream *s, tokenizer_state *ts, parser_config *const config); + +#endif diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index a6c2d4c2d..c2472f601 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -5,6 +5,7 @@ import itertools import warnings import weakref import contextlib +import operator from operator import itemgetter, index as opindex, methodcaller from collections.abc import Mapping @@ -13,6 +14,7 @@ from . import format from ._datasource import DataSource from numpy.core import overrides from numpy.core.multiarray import packbits, unpackbits +from numpy.core._multiarray_umath import _load_from_filelike from numpy.core.overrides import set_array_function_like_doc, set_module from ._iotools import ( LineSplitter, NameValidator, StringConverter, ConverterError, @@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): zipf.close() -def _floatconv(x): - try: - return float(x) # The fastest path. - except ValueError: - if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10. - try: - return float.fromhex(x) - except ValueError: - pass - raise # Raise the original exception, which makes more sense. - - -_CONVERTERS = [ # These converters only ever get strs (not bytes) as input. - (np.bool_, lambda x: bool(int(x))), - (np.uint64, np.uint64), - (np.int64, np.int64), - (np.integer, lambda x: int(float(x))), - (np.longdouble, np.longdouble), - (np.floating, _floatconv), - (complex, lambda x: complex(x.replace('+-', '-'))), - (np.bytes_, methodcaller('encode', 'latin-1')), - (np.unicode_, str), -] - - -def _getconv(dtype): - """ - Find the correct dtype converter. Adapted from matplotlib. - - Even when a lambda is returned, it is defined at the toplevel, to allow - testing for equality and enabling optimization for single-type data. - """ - for base, conv in _CONVERTERS: - if issubclass(dtype.type, base): - return conv - return str - - -# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers -# lifted to the toplevel because recursive inner functions cause either -# GC-dependent reference loops (because they are closures over loadtxt's -# internal variables) or large overheads if using a manual trampoline to hide -# the recursive calls. - - -# not to be confused with the flatten_dtype we import... -def _loadtxt_flatten_dtype_internal(dt): - """Unpack a structured data-type, and produce a packer function.""" - if dt.names is None: - # If the dtype is flattened, return. - # If the dtype has a shape, the dtype occurs - # in the list more than once. 
-        shape = dt.shape
-        if len(shape) == 0:
-            return ([dt.base], None)
-        else:
-            packing = [(shape[-1], list)]
-            if len(shape) > 1:
-                for dim in dt.shape[-2::-1]:
-                    packing = [(dim*packing[0][0], packing*dim)]
-            return ([dt.base] * int(np.prod(dt.shape)),
-                    functools.partial(_loadtxt_pack_items, packing))
-    else:
-        types = []
-        packing = []
-        for field in dt.names:
-            tp, bytes = dt.fields[field]
-            flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
-            types.extend(flat_dt)
-            flat_packing = flat_packer.args[0] if flat_packer else None
-            # Avoid extra nesting for subarrays
-            if tp.ndim > 0:
-                packing.extend(flat_packing)
-            else:
-                packing.append((len(flat_dt), flat_packing))
-        return (types, functools.partial(_loadtxt_pack_items, packing))
-
-
-def _loadtxt_pack_items(packing, items):
-    """Pack items into nested lists based on re-packing info."""
-    if packing is None:
-        return items[0]
-    elif packing is tuple:
-        return tuple(items)
-    elif packing is list:
-        return list(items)
-    else:
-        start = 0
-        ret = []
-        for length, subpacking in packing:
-            ret.append(
-                _loadtxt_pack_items(subpacking, items[start:start+length]))
-            start += length
-        return tuple(ret)
-
 def _ensure_ndmin_ndarray_check_param(ndmin):
     """Just checks if the param ndmin is supported on
        _ensure_ndmin_ndarray. Is intented to be used as
@@ -859,6 +766,310 @@ def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
     return (like,)
 
 
+def _check_nonneg_int(value, name="argument"):
+    try:
+        operator.index(value)
+    except TypeError:
+        raise TypeError(f"{name} must be an integer") from None
+    if value < 0:
+        raise ValueError(f"{name} must be nonnegative")
+
+
+def _preprocess_comments(iterable, comments, encoding):
+    """
+    Generator that consumes an iterable of lines and strips out the
+    multiple (or multi-character) comments from each line.
+    This is a pre-processing step to achieve feature parity with loadtxt
+    (we assume that this is a niche feature).
+    """
+    for line in iterable:
+        if isinstance(line, bytes):
+            # Need to handle conversion here, or the splitting would fail
+            line = line.decode(encoding)
+
+        for c in comments:
+            line = line.split(c, 1)[0]
+
+        yield line
+
+
+# The number of rows we read in one go if confronted with a parametric dtype
+_loadtxt_chunksize = 50000
+
+
+def _read(fname, *, delimiter=',', comment='#', quote='"',
+          imaginary_unit='j', usecols=None, skiprows=0,
+          max_rows=None, converters=None, ndmin=None, unpack=False,
+          dtype=np.float64, encoding="bytes"):
+    r"""
+    Read a NumPy array from a text file.
+
+    Parameters
+    ----------
+    fname : str or file object
+        The filename or the file to be read.
+    delimiter : str, optional
+        Field delimiter separating the fields in a line of the file.
+        Default is a comma, ','.
+    comment : str or sequence of str, optional
+        Character that begins a comment. All text from the comment
+        character to the end of the line is ignored.
+        Multiple comments or multiple-character comment strings are supported,
+        but may be slower and `quote` must be empty if used.
+    quote : str, optional
+        Character that is used to quote string fields. Default is '"'
+        (a double quote).
+    imaginary_unit : str, optional
+        Character that represents the imaginary unit `sqrt(-1)`.
+        Default is 'j'.
+    usecols : array_like, optional
+        A one-dimensional array of integer column numbers. These are the
+        columns from the file to be included in the array. If this value
+        is not given, all the columns are used.
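+        For example, ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and
+        6th columns.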
+    skiprows : int, optional
+        Number of lines to skip before interpreting the data in the file.
+    max_rows : int, optional
+        Maximum number of rows of data to read. Default is to read the
+        entire file.
+    converters : dict, optional
+        A dictionary mapping column number to a function that will parse the
+        column string into the desired value. E.g. if column 0 is a date
+        string: ``converters = {0: datestr2num}``. Converters can also be used
+        to provide a default value for missing data, e.g.
+        ``converters = {3: lambda s: float(s.strip() or 0)}``.
+        Default: None
+    ndmin : int, optional
+        Minimum dimension of the array returned.
+        Allowed values are 0, 1 or 2. Default is 0.
+    unpack : bool, optional
+        If True, the returned array is transposed, so that arguments may be
+        unpacked using ``x, y, z = _read(...)``. When used with a structured
+        data-type, arrays are returned for each field. Default is False.
+    dtype : numpy data type
+        A NumPy dtype instance, can be a structured dtype to map to the
+        columns of the file.
+    encoding : str, optional
+        Encoding used to decode the input file. The special value 'bytes'
+        (the default) enables backwards-compatible behavior for `converters`,
+        ensuring that inputs to the converter functions are encoded
+        bytes objects. The special value 'bytes' has no additional effect if
+        ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
+        default system encoding is used.
+
+    Returns
+    -------
+    ndarray
+        NumPy array.
+
+    Examples
+    --------
+    First we create a file for the example.
+
+    >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
+    >>> with open('example1.csv', 'w') as f:
+    ...     f.write(s1)
+    >>> a1 = _read('example1.csv')
+    >>> a1
+    array([[1., 2., 3.],
+           [4., 5., 6.]])
+
+    The second example has columns with different data types, so a
+    one-dimensional array with a structured data type is returned.
+    The tab character is used as the field delimiter.
+
+    >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
+    >>> with open('example2.tsv', 'w') as f:
+    ...     f.write(s2)
+    >>> a2 = _read('example2.tsv', delimiter='\t')
+    >>> a2
+    array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
+          dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')])
+    """
+    # Handle special 'bytes' keyword for encoding
+    byte_converters = False
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+
+    if dtype is None:
+        raise TypeError("a dtype must be provided.")
+    dtype = np.dtype(dtype)
+
+    read_dtype_via_object_chunks = None
+    if dtype.kind in 'SUM' and (
+            dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
+        # This is a legacy "flexible" dtype.  We do not truly support
+        # parametric dtypes currently (no dtype discovery step in the core),
+        # but have to support these for backward compatibility.
+        read_dtype_via_object_chunks = dtype
+        dtype = np.dtype(object)
+
+    if usecols is not None:
+        # Allow usecols to be a single int or a sequence of ints
+        try:
+            usecols_as_list = list(usecols)
+        except TypeError:
+            usecols_as_list = [usecols]
+        for col_idx in usecols_as_list:
+            try:
+                operator.index(col_idx)
+            except TypeError:
+                # Some unit tests for numpy.loadtxt require that the
+                # error message matches this format.
+                raise TypeError(
+                    "usecols must be an int or a sequence of ints but "
+                    "it contains at least one element of type %s" %
+                    type(col_idx),
+                    ) from None
+        # Fall back to existing code
+        usecols = np.array([operator.index(i) for i in usecols_as_list],
+                           dtype=np.int32)
+
+    _ensure_ndmin_ndarray_check_param(ndmin)
+
+    if not isinstance(comment, str):
+        # assume comments are a sequence of strings
+        comments = tuple(comment)
+        comment = ''
+        # If there is only one comment, and that comment has one character,
+        # the normal parsing can deal with it just fine.
+        if len(comments) == 1:
+            if isinstance(comments[0], str) and len(comments[0]) == 1:
+                comment = comments[0]
+                comments = None
+    elif len(comment) > 1:
+        comments = (comment,)
+        comment = ''
+    else:
+        comments = None
+
+    # comment is now either a 1 or 0 character string or a tuple:
+    if comments is not None:
+        assert comment == ''
+        # Note: An earlier version supported two-character comments (and
+        # could have been extended to multiple characters); we assume this
+        # is rare enough not to optimize for.
+        if quote != "":
+            raise ValueError(
+                "when multiple comments or a multi-character comment is given, "
+                "quotes are not supported. In this case the quote character "
+                "must be set to the empty string: `quote=''`.")
+    else:
+        # No preprocessing necessary
+        assert comments is None
+
+    if len(imaginary_unit) != 1:
+        raise ValueError('len(imaginary_unit) must be 1.')
+
+    _check_nonneg_int(skiprows)
+    if max_rows is not None:
+        _check_nonneg_int(max_rows)
+    else:
+        # Passing -1 to the C code means "read the entire file".
+        max_rows = -1
+
+    fh_closing_ctx = contextlib.nullcontext()
+    filelike = False
+    try:
+        if isinstance(fname, os.PathLike):
+            fname = os.fspath(fname)
+        # TODO: loadtxt actually uses `file + ''` to decide this?!
+        if isinstance(fname, str):
+            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+            if encoding is None:
+                encoding = getattr(fh, 'encoding', 'latin1')
+
+            fh_closing_ctx = contextlib.closing(fh)
+            data = fh
+            filelike = True
+        else:
+            if encoding is None:
+                encoding = getattr(fname, 'encoding', 'latin1')
+            data = iter(fname)
+    except TypeError as e:
+        raise ValueError(
+            f"fname must be a string, filehandle, list of strings,\n"
+            f"or generator. Got {type(fname)} instead.") from e
+
+    with fh_closing_ctx:
+        if comments is not None:
+            if filelike:
+                data = iter(data)
+                filelike = False
+            data = _preprocess_comments(data, comments, encoding)
+
+        if read_dtype_via_object_chunks is None:
+            arr = _load_from_filelike(
+                data, delimiter=delimiter, comment=comment, quote=quote,
+                imaginary_unit=imaginary_unit,
+                usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+                converters=converters, dtype=dtype,
+                encoding=encoding, filelike=filelike,
+                byte_converters=byte_converters)
+
+        else:
+            # This branch reads the file into chunks of object arrays and then
+            # casts them to the desired actual dtype.  This ensures correct
+            # string-length and datetime-unit discovery (as for `arr.astype()`).
+            # Due to chunking, certain error reports are less clear, currently.
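+            # Sizing example: with max_rows=120_000 and a chunksize of
+            # 50_000, the loop below reads chunks of 50_000, 50_000 and
+            # 20_000 rows, decrementing max_rows after each chunk and
+            # stopping early if a chunk comes back short.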
+            if filelike:
+                data = iter(data)  # cannot chunk when reading from file
+
+            c_byte_converters = False
+            if read_dtype_via_object_chunks == "S":
+                c_byte_converters = True  # Use latin1 rather than ascii
+
+            chunks = []
+            while max_rows != 0:
+                if max_rows < 0:
+                    chunk_size = _loadtxt_chunksize
+                else:
+                    chunk_size = min(_loadtxt_chunksize, max_rows)
+
+                next_arr = _load_from_filelike(
+                    data, delimiter=delimiter, comment=comment, quote=quote,
+                    imaginary_unit=imaginary_unit,
+                    usecols=usecols, skiprows=skiprows, max_rows=chunk_size,
+                    converters=converters, dtype=dtype,
+                    encoding=encoding, filelike=filelike,
+                    byte_converters=byte_converters,
+                    c_byte_converters=c_byte_converters)
+                # Cast here already.  We hope that this is better even for
+                # large files because the storage is more compact.  It could
+                # be adapted (in principle the concatenate could cast).
+                chunks.append(next_arr.astype(read_dtype_via_object_chunks))
+
+                skiprows = 0  # Only have to skip for first chunk
+                if max_rows >= 0:
+                    max_rows -= chunk_size
+                if len(next_arr) < chunk_size:
+                    # There was less data than requested, so we are done.
+                    break
+
+            # Need at least one chunk, but if empty, the last one may have
+            # the wrong shape.
+            if len(chunks) > 1 and len(chunks[-1]) == 0:
+                del chunks[-1]
+            if len(chunks) == 1:
+                arr = chunks[0]
+            else:
+                arr = np.concatenate(chunks, axis=0)
+
+    arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)
+
+    if unpack:
+        # Handle unpack like np.loadtxt.
+        # XXX Check interaction with ndmin!
+        dt = arr.dtype
+        if dt.names is not None:
+            # For structured arrays, return an array for each field.
+            return [arr[field] for field in dt.names]
+        else:
+            return arr.T
+    else:
+        return arr
+
+
 @set_array_function_like_doc
 @set_module('numpy')
 def loadtxt(fname, dtype=float, comments='#', delimiter=None,
@@ -1000,228 +1211,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             max_rows=max_rows, like=like
         )
 
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    # Nested functions used by loadtxt.
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-    def split_line(line: str):
-        """Chop off comments, strip, and split at delimiter."""
-        for comment in comments:  # Much faster than using a single regex.
-            line = line.split(comment, 1)[0]
-        line = line.strip('\r\n')
-        return line.split(delimiter) if line else []
+    if delimiter is None:
+        delimiter = ''
+    elif isinstance(delimiter, bytes):
+        delimiter = delimiter.decode("latin1")
 
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    # Main body of loadtxt.
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-    _ensure_ndmin_ndarray_check_param(ndmin)
+    if dtype is None:
+        dtype = np.float64
 
+    comment = comments
     # Type conversions for Py3 convenience
-    if comments is not None:
-        if isinstance(comments, (str, bytes)):
-            comments = [comments]
-        comments = [_decode_line(x) for x in comments]
-    else:
-        comments = []
-
-    if delimiter is not None:
-        delimiter = _decode_line(delimiter)
-
-    user_converters = converters
-
-    byte_converters = False
-    if encoding == 'bytes':
-        encoding = None
-        byte_converters = True
-
-    if usecols is not None:
-        # Copy usecols, allowing it to be a single int or a sequence of ints.
-        try:
-            usecols = list(usecols)
-        except TypeError:
-            usecols = [usecols]
-        for i, col_idx in enumerate(usecols):
-            try:
-                usecols[i] = opindex(col_idx)  # Cast to builtin int now.
- except TypeError as e: - e.args = ( - "usecols must be an int or a sequence of ints but " - "it contains at least one element of type %s" % - type(col_idx), - ) - raise - if len(usecols) > 1: - usecols_getter = itemgetter(*usecols) - else: - # Get an iterable back, even if using a single column. - usecols_getter = lambda obj, c=usecols[0]: [obj[c]] + if comment is None: + comment = '' else: - usecols_getter = None - - # Make sure we're dealing with a proper dtype - dtype = np.dtype(dtype) - defconv = _getconv(dtype) - - dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype) - - fh_closing_ctx = contextlib.nullcontext() - try: - if isinstance(fname, os_PathLike): - fname = os_fspath(fname) - if _is_string_like(fname): - fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) - fencoding = getattr(fh, 'encoding', 'latin1') - line_iter = iter(fh) - fh_closing_ctx = contextlib.closing(fh) - else: - line_iter = iter(fname) - fencoding = getattr(fname, 'encoding', 'latin1') - try: - first_line = next(line_iter) - except StopIteration: - pass # Nothing matters if line_iter is empty. - else: - # Put first_line back. - line_iter = itertools.chain([first_line], line_iter) - if isinstance(first_line, bytes): - # Using latin1 matches _decode_line's behavior. - decoder = methodcaller( - "decode", - encoding if encoding is not None else "latin1") - line_iter = map(decoder, line_iter) - except TypeError as e: - raise ValueError( - f"fname must be a string, filehandle, list of strings,\n" - f"or generator. Got {type(fname)} instead." - ) from e - - with fh_closing_ctx: - - # input may be a python2 io stream - if encoding is not None: - fencoding = encoding - # we must assume local encoding - # TODO emit portability warning? - elif fencoding is None: - import locale - fencoding = locale.getpreferredencoding() - - # Skip the first `skiprows` lines - for i in range(skiprows): - next(line_iter) - - # Read until we find a line with some values, and use it to determine - # the need for decoding and estimate the number of columns. - for first_line in line_iter: - ncols = len(usecols or split_line(first_line)) - if ncols: - # Put first_line back. - line_iter = itertools.chain([first_line], line_iter) - break - else: # End of lines reached - ncols = len(usecols or []) - warnings.warn('loadtxt: Empty input file: "%s"' % fname, - stacklevel=2) - - line_iter = itertools.islice(line_iter, max_rows) - lineno_words_iter = filter( - itemgetter(1), # item[1] is words; filter skips empty lines. - enumerate(map(split_line, line_iter), 1 + skiprows)) - - # Now that we know ncols, create the default converters list, and - # set packing, if necessary. - if len(dtype_types) > 1: - # We're dealing with a structured array, each field of - # the dtype matches a column - converters = [_getconv(dt) for dt in dtype_types] - else: - # All fields have the same dtype; use specialized packers which are - # much faster than those using _loadtxt_pack_items. - converters = [defconv for i in range(ncols)] - if ncols == 1: - packer = itemgetter(0) - else: - def packer(row): return row - - # By preference, use the converters specified by the user - for i, conv in (user_converters or {}).items(): - if usecols: - try: - i = usecols.index(i) - except ValueError: - # Unused converter specified - continue - if byte_converters: - # converters may use decode to workaround numpy's old - # behaviour, so encode the string again (converters are only - # called with strings) before passing to the user converter. 
- def tobytes_first(conv, x): - return conv(x.encode("latin1")) - converters[i] = functools.partial(tobytes_first, conv) - else: - converters[i] = conv - - fencode = methodcaller("encode", fencoding) - converters = [conv if conv is not bytes else fencode - for conv in converters] - if len(set(converters)) == 1: - # Optimize single-type data. Note that this is only reached if - # `_getconv` returns equal callables (i.e. not local lambdas) on - # equal dtypes. - def convert_row(vals, _conv=converters[0]): - return [*map(_conv, vals)] - else: - def convert_row(vals): - return [conv(val) for conv, val in zip(converters, vals)] - - # read data in chunks and fill it into an array via resize - # over-allocating and shrinking the array later may be faster but is - # probably not relevant compared to the cost of actually reading and - # converting the data - X = None - while True: - chunk = [] - for lineno, words in itertools.islice( - lineno_words_iter, _loadtxt_chunksize): - if usecols_getter is not None: - words = usecols_getter(words) - elif len(words) != ncols: - raise ValueError( - f"Wrong number of columns at line {lineno}") - # Convert each value according to its column, then pack it - # according to the dtype's nesting, and store it. - chunk.append(packer(convert_row(words))) - if not chunk: # The islice is empty, i.e. we're done. - break - - if X is None: - X = np.array(chunk, dtype) - else: - nshape = list(X.shape) - pos = nshape[0] - nshape[0] += len(chunk) - X.resize(nshape, refcheck=False) - X[pos:, ...] = chunk - - if X is None: - X = np.array([], dtype) + if isinstance(comment, (str, bytes)): + comment = [comment] + comment = [x.decode('latin1') if isinstance(x, bytes) else x for x in comment] - # Multicolumn data are returned with shape (1, N, M), i.e. - # (1, 1, M) for a single row - remove the singleton dimension there - if X.ndim == 3 and X.shape[:2] == (1, 1): - X.shape = (1, -1) + arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, + converters=converters, skiprows=skiprows, usecols=usecols, + unpack=unpack, ndmin=ndmin, encoding=encoding, + max_rows=max_rows, quote='') - X = _ensure_ndmin_ndarray(X, ndmin=ndmin) - - if unpack: - if len(dtype_types) > 1: - # For structured arrays, return an array for each field. - return [X[field] for field in dtype.names] - else: - return X.T - else: - return X + return arr _loadtxt_with_like = array_function_dispatch( |