diff options
author | Armin Rigo <arigo@tunes.org> | 2012-09-22 21:50:03 +0200 |
---|---|---|
committer | Armin Rigo <arigo@tunes.org> | 2012-09-22 21:50:03 +0200 |
commit | 42b970f5cf60781d3b41916bb99b958c6bac53d8 (patch) | |
tree | 429fe2809fbe2ff759f03f2ad835c4f369bdbc6c /demo | |
parent | 0fc8332257f185155084b2d90b2e97f17534a369 (diff) | |
download | cffi-42b970f5cf60781d3b41916bb99b958c6bac53d8.tar.gz |
Trying out a "fast csv" module that compiles specialized versions
of the parsing function...
Diffstat (limited to 'demo')
-rw-r--r-- | demo/fastcsv.py | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/demo/fastcsv.py b/demo/fastcsv.py new file mode 100644 index 0000000..7f04107 --- /dev/null +++ b/demo/fastcsv.py @@ -0,0 +1,266 @@ +import csv +import cffi + +# IN-PROGRESS. See the demo at the end of the file + + +dialect2ffi = {} + +def _make_ffi_from_dialect(dialect): + + ffi = cffi.FFI() + + ffi.cdef(""" + long parse_line(char *rawline, long inputlength); + """) + + d = {'quotechar': ord(dialect.quotechar), + 'quoting': int(dialect.quoting), + 'skipinitialspace': int(dialect.skipinitialspace), + 'delimiter': ord(dialect.delimiter), + 'doublequote': int(dialect.doublequote), + 'strict': int(dialect.strict), + } + if dialect.escapechar is not None: + d['is_escape_char'] = '== %d' % ord(dialect.escapechar) + else: + d['is_escape_char'] = '&& 0' + + lib = ffi.verify(r''' + + typedef enum { + START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, + IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD, + EAT_CRNL + } ParserState; + + typedef enum { + QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE + } QuoteStyle; + + typedef struct { + ParserState state; /* current CSV parse state */ + char *field; /* build current field in here */ + int field_size; /* size of allocated buffer */ + int field_len; /* length of current field */ + int numeric_field; /* treat field as numeric */ + } ReaderObj; + + static void + parse_add_char(ReaderObj *self, char c) + { + *self->field++ = c; + } + + static void + parse_save_field(ReaderObj *self) + { + *self->field++ = 0; + } + + static int + parse_process_char(ReaderObj *self, char c) + { + switch (self->state) { + case START_RECORD: + /* start of record */ + if (c == '\0') + /* empty line - return [] */ + break; + else if (c == '\n' || c == '\r') { + self->state = EAT_CRNL; + break; + } + /* normal character - handle as START_FIELD */ + self->state = START_FIELD; + /* fallthru */ + case START_FIELD: + /* expecting field */ + if (c == '\n' || c == '\r' || c == '\0') { + /* save empty field - return [fields] */ + parse_save_field(self); + self->state = (c == '\0' ? START_RECORD : EAT_CRNL); + } + else if (c == %(quotechar)d && + %(quoting)d != QUOTE_NONE) { + /* start quoted field */ + self->state = IN_QUOTED_FIELD; + } + else if (c %(is_escape_char)s) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + else if (c == ' ' && %(skipinitialspace)d) + /* ignore space at start of field */ + ; + else if (c == %(delimiter)d) { + /* save empty field */ + parse_save_field(self); + } + else { + /* begin new unquoted field */ + if (%(quoting)d == QUOTE_NONNUMERIC) + self->numeric_field = 1; + parse_add_char(self, c); + self->state = IN_FIELD; + } + break; + + case ESCAPED_CHAR: + if (c == '\0') + c = '\n'; + parse_add_char(self, c); + self->state = IN_FIELD; + break; + + case IN_FIELD: + /* in unquoted field */ + if (c == '\n' || c == '\r' || c == '\0') { + /* end of line - return [fields] */ + parse_save_field(self); + self->state = (c == '\0' ? START_RECORD : EAT_CRNL); + } + else if (c %(is_escape_char)s) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + else if (c == %(delimiter)d) { + /* save field - wait for new field */ + parse_save_field(self); + self->state = START_FIELD; + } + else { + /* normal character - save in field */ + parse_add_char(self, c); + } + break; + + case IN_QUOTED_FIELD: + /* in quoted field */ + if (c == '\0') + ; + else if (c %(is_escape_char)s) { + /* Possible escape character */ + self->state = ESCAPE_IN_QUOTED_FIELD; + } + else if (c == %(quotechar)d && + %(quoting)d != QUOTE_NONE) { + if (%(doublequote)d) { + /* doublequote; " represented by "" */ + self->state = QUOTE_IN_QUOTED_FIELD; + } + else { + /* end of quote part of field */ + self->state = IN_FIELD; + } + } + else { + /* normal character - save in field */ + parse_add_char(self, c); + } + break; + + case ESCAPE_IN_QUOTED_FIELD: + if (c == '\0') + c = '\n'; + parse_add_char(self, c); + self->state = IN_QUOTED_FIELD; + break; + + case QUOTE_IN_QUOTED_FIELD: + /* doublequote - seen a quote in an quoted field */ + if (%(quoting)d != QUOTE_NONE && + c == %(quotechar)d) { + /* save "" as " */ + parse_add_char(self, c); + self->state = IN_QUOTED_FIELD; + } + else if (c == %(delimiter)d) { + /* save field - wait for new field */ + parse_save_field(self); + self->state = START_FIELD; + } + else if (c == '\n' || c == '\r' || c == '\0') { + /* end of line - return [fields] */ + parse_save_field(self); + self->state = (c == '\0' ? START_RECORD : EAT_CRNL); + } + else if (!%(strict)d) { + parse_add_char(self, c); + self->state = IN_FIELD; + } + else { + /* illegal */ + /*PyErr_Format(error_obj, "'%%c' expected after '%%c'", + dialect->delimiter, + dialect->quotechar);*/ + return -1; + } + break; + + case EAT_CRNL: + if (c == '\n' || c == '\r') + ; + else if (c == '\0') + self->state = START_RECORD; + else { + /*PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");*/ + return -1; + } + break; + + } + return 0; + } + + static void + parse_reset(ReaderObj *self, char *rawline) + { + self->field = rawline; + self->state = START_RECORD; + self->numeric_field = 0; + } + + long parse_line(char *rawline, long inputlength) + { + char *p; + ReaderObj reader; + parse_reset(&reader, rawline); + + for (p=rawline; inputlength > 0; inputlength--, p++) { + if (parse_process_char(&reader, *p) < 0) + return -1; + } + if (parse_process_char(&reader, 0) < 0) + return -1; + return reader.field - rawline - 1; + } + ''' % d) + + return ffi, lib + + +def fastcsv_reader(f, dialect): + dialect = csv.get_dialect(dialect) + try: + ffi, lib = dialect2ffi[dialect] + except KeyError: + ffi, lib = dialect2ffi[dialect] = _make_ffi_from_dialect(dialect) + # + linelen = -1 + for line in f: + if linelen <= len(line): + linelen = 2 * len(line) + rawline = ffi.new("char[]", linelen) + ffi.buffer(rawline, len(line))[:] = line + n = lib.parse_line(rawline, len(line)) + assert n >= 0 + yield ffi.buffer(rawline, n)[:].split('\x00') + + +if __name__ == '__main__': + csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE) + with open('/etc/passwd', 'rb') as f: + reader = fastcsv_reader(f, 'unixpwd') + for row in reader: + print row |