import csv

import cffi

# IN-PROGRESS.  See the demo at the end of the file

# Cache of compiled parsers: maps a csv dialect object to the (ffi, lib)
# pair built for it, so the C source is only compiled once per dialect.
dialect2ffi = {}


def _make_ffi_from_dialect(dialect):
    """Compile a C line parser specialised for *dialect*.

    The dialect's settings are substituted into the C source as integer
    constants before compilation, so the generated state machine contains
    no run-time dialect lookups.

    Returns the ``(ffi, lib)`` pair; ``lib.parse_line(rawline, length)``
    parses ``rawline`` in place (see the comment on ``parse_line`` in the
    C source below for its contract).
    """
    ffi = cffi.FFI()

    ffi.cdef("""
        long parse_line(char *rawline, long inputlength);
    """)

    # csv allows quotechar to be None when quoting is QUOTE_NONE.  Every
    # quotechar comparison in the C source is paired with a
    # "quoting != QUOTE_NONE" check, so the placeholder 0 is never matched
    # in that mode; it only keeps ord() from raising TypeError.
    quotechar = dialect.quotechar
    d = {
        'quotechar': ord(quotechar) if quotechar is not None else 0,
        'quoting': int(dialect.quoting),
        'skipinitialspace': int(dialect.skipinitialspace),
        'delimiter': ord(dialect.delimiter),
        'doublequote': int(dialect.doublequote),
        'strict': int(dialect.strict),
    }
    # 'is_escape_char' is pasted right after "c" in the C source: either a
    # real comparison, or "&& 0" so the test is always false.
    if dialect.escapechar is not None:
        d['is_escape_char'] = '== %d' % ord(dialect.escapechar)
    else:
        d['is_escape_char'] = '&& 0'

    # NOTE: the source below goes through %-formatting, so any literal
    # percent sign inside it must be written as a doubled percent sign.
    lib = ffi.verify(r'''

    typedef enum {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
        IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
        EAT_CRNL
    } ParserState;

    /* same order (and therefore same values) as the csv.QUOTE_* constants */
    typedef enum {
        QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE
    } QuoteStyle;

    typedef struct {
        ParserState state;          /* current CSV parse state */
        char *field;                /* build current field in here */
        int field_size;             /* size of allocated buffer */
        int field_len;              /* length of current field */
        int numeric_field;          /* treat field as numeric */
    } ReaderObj;

    /* append one character to the output */
    static void
    parse_add_char(ReaderObj *self, char c)
    {
        *self->field++ = c;
    }

    /* terminate the current field with a NUL byte */
    static void
    parse_save_field(ReaderObj *self)
    {
        *self->field++ = 0;
    }

    /* Feed one input character to the state machine.  Writes at most one
       byte of output per call.  Returns 0 on success, -1 on a parse
       error. */
    static int
    parse_process_char(ReaderObj *self, char c)
    {
        switch (self->state) {
        case START_RECORD:
            /* start of record */
            if (c == '\0')
                /* empty line - return [] */
                break;
            else if (c == '\n' || c == '\r') {
                self->state = EAT_CRNL;
                break;
            }
            /* normal character - handle as START_FIELD */
            self->state = START_FIELD;
            /* fallthru */
        case START_FIELD:
            /* expecting field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* save empty field - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                /* start quoted field */
                self->state = IN_QUOTED_FIELD;
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == ' ' && %(skipinitialspace)d)
                /* ignore space at start of field */
                ;
            else if (c == %(delimiter)d) {
                /* save empty field */
                parse_save_field(self);
            }
            else {
                /* begin new unquoted field */
                if (%(quoting)d == QUOTE_NONNUMERIC)
                    self->numeric_field = 1;
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            break;

        case ESCAPED_CHAR:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_FIELD;
            break;

        case IN_FIELD:
            /* in unquoted field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case IN_QUOTED_FIELD:
            /* in quoted field */
            if (c == '\0')
                ;
            else if (c %(is_escape_char)s) {
                /* Possible escape character */
                self->state = ESCAPE_IN_QUOTED_FIELD;
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                if (%(doublequote)d) {
                    /* doublequote; " represented by "" */
                    self->state = QUOTE_IN_QUOTED_FIELD;
                }
                else {
                    /* end of quote part of field */
                    self->state = IN_FIELD;
                }
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case ESCAPE_IN_QUOTED_FIELD:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_QUOTED_FIELD;
            break;

        case QUOTE_IN_QUOTED_FIELD:
            /* doublequote - seen a quote in an quoted field */
            if (%(quoting)d != QUOTE_NONE &&
                c == %(quotechar)d) {
                /* save "" as " */
                parse_add_char(self, c);
                self->state = IN_QUOTED_FIELD;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (!%(strict)d) {
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            else {
                /* illegal */
                /*PyErr_Format(error_obj, "'%%c' expected after '%%c'",
                                dialect->delimiter,
                                dialect->quotechar);*/
                return -1;
            }
            break;

        case EAT_CRNL:
            if (c == '\n' || c == '\r')
                ;
            else if (c == '\0')
                self->state = START_RECORD;
            else {
                /*PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");*/
                return -1;
            }
            break;

        }
        return 0;
    }

    static void
    parse_reset(ReaderObj *self, char *rawline)
    {
        self->field = rawline;
        self->state = START_RECORD;
        self->numeric_field = 0;
    }

    /* Parse one line in place: the fields are written back into rawline,
       each saved field followed by a NUL byte.  The output never
       overtakes the input because every input character produces at most
       one output byte; the extra end-of-line step may write one more
       byte, so rawline needs room for inputlength + 1 bytes.

       Returns the number of bytes written (0 for an empty record), or -1
       on a parse error.  Returning the plain byte count keeps "empty
       record" distinct from the error code. */
    long parse_line(char *rawline, long inputlength)
    {
        char *p;
        ReaderObj reader;
        parse_reset(&reader, rawline);

        for (p=rawline; inputlength > 0; inputlength--, p++) {
            if (parse_process_char(&reader, *p) < 0)
                return -1;
        }
        /* signal end of line */
        if (parse_process_char(&reader, 0) < 0)
            return -1;
        return reader.field - rawline;
    }
    ''' % d)

    return ffi, lib


def fastcsv_reader(f, dialect):
    """Yield each line of *f* as a list of fields, parsed by compiled C code.

    *f* is any iterable of byte strings (e.g. a file opened in binary
    mode); *dialect* is a dialect name accepted by ``csv.get_dialect()``.
    The parser for a dialect is compiled on first use and cached in
    ``dialect2ffi``.

    Blank lines yield ``[]``.  Raises ``csv.Error`` when the C parser
    rejects a line.

    Known limitations of this in-progress version: every line is parsed
    independently (the parser state is reset per line), so a quoted field
    cannot span several lines; and fields are always returned as byte
    strings -- no numeric conversion is done for QUOTE_NONNUMERIC.
    """
    dialect = csv.get_dialect(dialect)
    try:
        ffi, lib = dialect2ffi[dialect]
    except KeyError:
        ffi, lib = dialect2ffi[dialect] = _make_ffi_from_dialect(dialect)

    capacity = 0        # size of the current scratch buffer
    rawline = None      # reused across lines, regrown only when too small
    for line in f:
        length = len(line)
        # parse_line() may write up to length + 1 bytes; over-allocate so
        # that slightly longer lines do not force a new buffer each time.
        if capacity <= length:
            capacity = 2 * (length + 1)
            rawline = ffi.new("char[]", capacity)
        ffi.buffer(rawline, length)[:] = line
        written = lib.parse_line(rawline, length)
        # An explicit raise (not an assert) so the check survives "-O".
        if written < 0:
            raise csv.Error("malformed CSV line: %r" % (line,))
        if written == 0:
            # nothing was written: empty record
            yield []
        else:
            # Drop the NUL terminating the last field, then split on the
            # NULs separating the others.  b'\x00' is the same object type
            # as '\x00' on Python 2 and matches the bytes result on
            # Python 3.
            yield ffi.buffer(rawline, written - 1)[:].split(b'\x00')


if __name__ == '__main__':
    csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE)
    with open('/etc/passwd', 'rb') as f:
        reader = fastcsv_reader(f, 'unixpwd')
        for row in reader:
            # print(row) with one argument behaves the same on Python 2
            # and Python 3
            print(row)