From fdbc7686266f80e1e8a3a5e914dec6819dd74380 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 20 Apr 2016 23:30:23 +0100 Subject: [PATCH 1/3] DOC: Add documentation for delim_whitespace Addresses DOC issue part of gh-12912. --- doc/source/io.rst | 6 ++++++ pandas/io/parsers.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 351a7059b2739..25925ef4a8b91 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -97,6 +97,12 @@ sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_tabl Regex example: ``'\\r\\t'``. delimiter : str, default ``None`` Alternative argument name for sep. +delim_whitespace : boolean, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) + will be used as the delimiter. Equivalent to setting ``sep='\s+'``. + If this option is set to True, nothing should be passed in for the + ``delimiter`` parameter. This parameter is currently supported for + the C parser only. Column and Index Locations and Names ++++++++++++++++++++++++++++++++++++ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e08268a1944b7..4ece66122bcd0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -209,6 +209,11 @@ warn_bad_lines : boolean, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. (Only valid with C parser). +delim_whitespace : boolean, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used + as the delimiter. Equivalent to setting ``sep='\s+'``. If this option is set + to True, nothing should be passed in for the ``delimiter`` parameter. This + parameter is currently supported for the C parser only. 
Returns ------- From 62d626072689fcf76008e814ce62f82f7c209554 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 21 Apr 2016 00:01:10 +0100 Subject: [PATCH 2/3] BUG: Parse custom terminator with whitespace delimiter Addresses BUG issue part of gh-12912. Closes gh-12912. --- doc/source/whatsnew/v0.18.1.txt | 1 + pandas/io/tests/test_parsers.py | 9 ++ pandas/src/parser/tokenizer.c | 259 +++++++++++++++++++++++++++++++- 3 files changed, 265 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 821f093083026..d386f32d35195 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -302,6 +302,7 @@ Bug Fixes - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`) - Bug in ``read_csv`` when specifying ``names``, ```usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`) +- Bug in ``read_csv`` when specifying ``delim_whitespace=True`` and ``lineterminator`` simultaneously with the C engine (:issue:`12912`) - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`). 
- Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`) - Bug in ``groupby`` where complex types are coerced to float (:issue:`12902`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index ab6103f0f523c..1fab316d80ae6 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3878,6 +3878,15 @@ def test_buffer_rd_bytes(self): except Exception as e: pass + def test_delim_whitespace_custom_terminator(self): + # See gh-12912 + data = """a b c~1 2 3~4 5 6~7 8 9""" + df = self.read_csv(StringIO(data), lineterminator='~', + delim_whitespace=True) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=['a', 'b', 'c']) + tm.assert_frame_equal(df, expected) + class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase): engine = 'c' diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index a75ce2bde80e6..060dba820ea8d 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1641,6 +1641,251 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) return 0; } +// custom line terminator +int tokenize_whitespace_customterm(parser_t *self, size_t line_limit) +{ + int i, slen, start_lines; + long maxstreamsize; + char c; + char *stream; + char *buf = self->data + self->datapos; + + start_lines = self->lines; + + if (make_stream_space(self, self->datalen - self->datapos) < 0) { + self->error_msg = "out of memory"; + return -1; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + maxstreamsize = self->stream_cap; + + TRACE(("%s\n", buf)); + + for (i = self->datapos; i < self->datalen; ++i) + { + // next character in file + c = *buf++; + + TRACE(("tokenize_whitespace_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); + + switch(self->state) { + + case SKIP_LINE: + if (c == 
self->lineterminator) { + END_LINE(); + } + break; + + case WHITESPACE_LINE: + if (c == self->lineterminator) { + self->file_lines++; + self->state = START_RECORD; + break; + } + // fall through + + case EAT_WHITESPACE: + if (c == self->lineterminator) { + END_LINE(); + self->state = START_RECORD; + break; + } else if (!IS_WHITESPACE(c)) { + self->state = START_FIELD; + // fall through to subsequent state + } else { + // if whitespace char, keep slurping + break; + } + + case START_RECORD: + // start of record + if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; + if (c == self->lineterminator) { + END_LINE(); + } + break; + } else if (c == self->lineterminator) { + if (self->skip_empty_lines) { + self->file_lines++; + } else { + END_LINE(); + } + break; + } else if (IS_WHITESPACE(c)) { + if (self->skip_empty_lines) + self->state = WHITESPACE_LINE; + else + self->state = EAT_WHITESPACE; + break; + } else if (c == self->commentchar) { + self->state = EAT_LINE_COMMENT; + break; + } else { + // nominal character - handle as START_FIELD + self->state = START_FIELD; + } + // fall through + + case START_FIELD: + // expecting field + if (c == self->lineterminator) { + END_FIELD(); + END_LINE(); + } else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + // start quote field + self->state = IN_QUOTED_FIELD; + } else if (c == self->escapechar) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_WHITESPACE(c)) { + self->state = EAT_WHITESPACE; + } else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } else { + // begin new unquoted field + if (self->quoting == QUOTE_NONNUMERIC) + self->numeric_field = 1; + + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case EAT_LINE_COMMENT: + if (c == self->lineterminator) { + self->file_lines++; + self->state = START_RECORD; + } + break; + + case ESCAPED_CHAR: + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + + case IN_FIELD: + // in 
unquoted field + if (c == self->lineterminator) { + END_FIELD(); + END_LINE(); + } else if (c == self->escapechar) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_WHITESPACE(c)) { + // end of field (end of line not reached yet) + END_FIELD(); + self->state = EAT_WHITESPACE; + } else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } else { + // normal character - save in field + PUSH_CHAR(c); + } + break; + + case IN_QUOTED_FIELD: + // in quoted field + if (c == self->escapechar) { + // possible escape character + self->state = ESCAPE_IN_QUOTED_FIELD; + } else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + if (self->doublequote) { + // double quote - " represented by "" + self->state = QUOTE_IN_QUOTED_FIELD; + } + else { + // end of quote part of field + self->state = IN_FIELD; + } + } else { + // normal character - save in field + PUSH_CHAR(c); + } + break; + + case ESCAPE_IN_QUOTED_FIELD: + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + break; + + case QUOTE_IN_QUOTED_FIELD: + // double quote - seen a quote in an quoted field + if (self->quoting != QUOTE_NONE && c == self->quotechar) { + // save "" as " + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } else if (IS_WHITESPACE(c)) { + // end of field (end of line not reached yet) + END_FIELD(); + self->state = EAT_WHITESPACE; + } else if (c == self->lineterminator) { + END_FIELD(); + END_LINE(); + } else if (!self->strict) { + PUSH_CHAR(c); + self->state = IN_FIELD; + } else { + self->error_msg = (char*) malloc(50); + sprintf(self->error_msg, "'%c' expected after '%c'", + self->delimiter, self->quotechar); + goto parsingerror; + } + break; + + case EAT_CRNL: + if (c == self->lineterminator) { + END_LINE(); + } else if (IS_WHITESPACE(c)){ + // Handle \r-delimited files + END_LINE_STATE(EAT_WHITESPACE); + } else { + /* XXX + * first character of a new record--need to back up and reread + * to handle properly... 
+ */ + i--; buf--; // back up one character (HACK!) + END_LINE_STATE(START_RECORD); + } + break; + + case EAT_COMMENT: + if (c == self->lineterminator) { + END_LINE(); + } + break; + + default: + break; + } + } + + _TOKEN_CLEANUP(); + + TRACE(("Finished tokenizing input\n")) + + return 0; + +parsingerror: + i++; + _TOKEN_CLEANUP(); + + return -1; + +linelimit: + i++; + _TOKEN_CLEANUP(); + + return 0; +} static int parser_handle_eof(parser_t *self) { TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) @@ -1851,11 +2096,17 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { int start_lines = self->lines; if (self->delim_whitespace) { - tokenize_bytes = tokenize_whitespace; - } else if (self->lineterminator == '\0') { - tokenize_bytes = tokenize_delimited; + if (self->lineterminator == '\0') { + tokenize_bytes = tokenize_whitespace; + } else { + tokenize_bytes = tokenize_whitespace_customterm; + } } else { - tokenize_bytes = tokenize_delim_customterm; + if (self->lineterminator == '\0') { + tokenize_bytes = tokenize_delimited; + } else { + tokenize_bytes = tokenize_delim_customterm; + } } if (self->state == FINISHED) { From 78cf922c64a8ba4295a220ca9f77c36889635217 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 21 Apr 2016 14:18:57 +0100 Subject: [PATCH 3/3] MAINT: Refactor C engine tokenizing --- pandas/src/parser/tokenizer.c | 1211 ++++++--------------------------- 1 file changed, 194 insertions(+), 1017 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 060dba820ea8d..013c47cd09a9b 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -693,956 +693,38 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { #define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) -typedef int (*parser_op)(parser_t *self, size_t line_limit); +#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \ + c == '\n') || c == self->lineterminator) -#define 
_TOKEN_CLEANUP() \ - self->stream_len = slen; \ - self->datapos = i; \ - TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); - - -int skip_this_line(parser_t *self, int64_t rownum) { - if (self->skipset != NULL) { - return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != - ((kh_int64_t*)self->skipset)->n_buckets ); - } - else { - return ( rownum <= self->skip_first_N_rows ); - } -} - -int tokenize_delimited(parser_t *self, size_t line_limit) -{ - int i, slen, start_lines; - long maxstreamsize; - char c; - char *stream; - char *buf = self->data + self->datapos; - - - start_lines = self->lines; - - if (make_stream_space(self, self->datalen - self->datapos) < 0) { - self->error_msg = "out of memory"; - return -1; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - maxstreamsize = self->stream_cap; - TRACE(("%s\n", buf)); - - for (i = self->datapos; i < self->datalen; ++i) - { - // Next character in file - c = *buf++; - - TRACE(("tokenize_delimited - Iter: %d Char: 0x%x Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch(self->state) { - - case SKIP_LINE: - TRACE(("tokenize_delimited SKIP_LINE 0x%x, state %d\n", c, self->state)); - if (c == '\n') { - END_LINE(); - } else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case START_RECORD: - // start of record - if (skip_this_line(self, self->file_lines)) { - self->state = SKIP_LINE; - if (c == '\n') { - END_LINE(); - } - break; - } - else if (c == '\n') { - // \n\r possible? 
- if (self->skip_empty_lines) - { - self->file_lines++; - } - else - { - END_LINE(); - } - break; - } - else if (c == '\r') { - if (self->skip_empty_lines) - { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - else - self->state = EAT_CRNL; - break; - } - else if (c == self->commentchar) { - self->state = EAT_LINE_COMMENT; - break; - } - else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - break; - } - - /* normal character - handle as START_FIELD */ - self->state = START_FIELD; - /* fallthru */ - - case START_FIELD: - /* expecting field */ - if (c == '\n') { - END_FIELD(); - END_LINE(); - } else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - /* start quoted field */ - self->state = IN_QUOTED_FIELD; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == ' ' && self->skipinitialspace) - /* ignore space at start of field */ - ; - else if (c == self->delimiter) { - /* save empty field */ - END_FIELD(); - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* begin new unquoted field */ -// if (self->quoting == QUOTE_NONNUMERIC) -// self->numeric_field = 1; - - // TRACE(("pushing %c", c)); - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case WHITESPACE_LINE: // check if line is whitespace-only - if (c == '\n') { - self->file_lines++; - self->state = START_RECORD; // ignore empty line - } - else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - else if (IS_WHITESPACE(c) && c != self->delimiter) - ; - else { // backtrack - /* We have to use i + 1 because buf has been incremented but not i */ - do { - --buf; - --i; - } while (i + 1 > self->datapos && *buf != '\n'); - - if (*buf == '\n') // reached a newline rather than the beginning - { - ++buf; // move pointer to first 
char after newline - ++i; - } - self->state = START_FIELD; - } - break; - - case ESCAPED_CHAR: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case EAT_LINE_COMMENT: - if (c == '\n') { - self->file_lines++; - self->state = START_RECORD; - } else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case IN_FIELD: - /* in unquoted field */ - if (c == '\n') { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == self->delimiter) { - // End of field. End of line not reached yet - END_FIELD(); - self->state = START_FIELD; - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - /* in quoted field */ - if (c == self->escapechar) { - /* Possible escape character */ - self->state = ESCAPE_IN_QUOTED_FIELD; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - if (self->doublequote) { - /* doublequote; " represented by "" */ - self->state = QUOTE_IN_QUOTED_FIELD; - } - else { - /* end of quote part of field */ - self->state = IN_FIELD; - } - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - /* doublequote - seen a quote in an quoted field */ - if (self->quoting != QUOTE_NONE && c == self->quotechar) { - /* save "" as " */ - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } - else if (c == self->delimiter) { - // End of field. 
End of line not reached yet - - END_FIELD(); - self->state = START_FIELD; - } - else if (c == '\n') { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (!self->strict) { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - else { - self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, "'%c' expected after '%c'", - self->delimiter, self->quotechar); - goto parsingerror; - } - break; - - case EAT_COMMENT: - if (c == '\n') { - END_LINE(); - } else if (c == '\r') { - self->state = EAT_CRNL; - } - break; - - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == self->delimiter){ - // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(START_FIELD); - } else { - /* \r line terminator */ - - /* UGH. we don't actually want to consume the token. fix this later */ - self->stream_len = slen; - if (end_line(self) < 0) { - goto parsingerror; - } - stream = self->stream + self->stream_len; - slen = self->stream_len; - self->state = START_RECORD; - - /* HACK, let's try this one again */ - --i; buf--; - if (line_limit > 0 && self->lines == start_lines + line_limit) { - goto linelimit; - } - - } - break; - - case EAT_CRNL_NOP: /* inside an ignored comment line */ - self->state = START_RECORD; - /* \r line terminator -- parse this character again */ - if (c != '\n' && c != self->delimiter) { - --i; - --buf; - } - break; - default: - break; - - } - } - - _TOKEN_CLEANUP(); - - TRACE(("Finished tokenizing input\n")) - - return 0; - -parsingerror: - i++; - _TOKEN_CLEANUP(); - - return -1; - -linelimit: - i++; - _TOKEN_CLEANUP(); - - return 0; -} - -/* custom line terminator */ -int tokenize_delim_customterm(parser_t *self, size_t line_limit) -{ - - int i, slen, start_lines; - long maxstreamsize; - char c; - char *stream; - char *buf = self->data + self->datapos; - - - start_lines = self->lines; - - if (make_stream_space(self, 
self->datalen - self->datapos) < 0) { - self->error_msg = "out of memory"; - return -1; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - maxstreamsize = self->stream_cap; - - TRACE(("%s\n", buf)); - - for (i = self->datapos; i < self->datalen; ++i) - { - // Next character in file - c = *buf++; - - TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch(self->state) { - - case SKIP_LINE: -// TRACE(("tokenize_delim_customterm SKIP_LINE %c, state %d\n", c, self->state)); - if (c == self->lineterminator) { - END_LINE(); - } - break; - - case START_RECORD: - // start of record - if (skip_this_line(self, self->file_lines)) { - self->state = SKIP_LINE; - if (c == self->lineterminator) { - END_LINE(); - } - break; - } - else if (c == self->lineterminator) { - // \n\r possible? - if (self->skip_empty_lines) - { - self->file_lines++; - } - else - { - END_LINE(); - } - break; - } - else if (c == self->commentchar) { - self->state = EAT_LINE_COMMENT; - break; - } - else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) - { - self->state = WHITESPACE_LINE; - break; - } - /* normal character - handle as START_FIELD */ - self->state = START_FIELD; - /* fallthru */ - case START_FIELD: - /* expecting field */ - if (c == self->lineterminator) { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - /* start quoted field */ - self->state = IN_QUOTED_FIELD; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == ' ' && self->skipinitialspace) - /* ignore space at start of field */ - ; - else if (c == self->delimiter) { - /* save empty field */ - END_FIELD(); - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* 
begin new unquoted field */ - if (self->quoting == QUOTE_NONNUMERIC) - self->numeric_field = 1; - - // TRACE(("pushing %c", c)); - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case WHITESPACE_LINE: // check if line is whitespace-only - if (c == self->lineterminator) { - self->file_lines++; - self->state = START_RECORD; // ignore empty line - } - else if (IS_WHITESPACE(c) && c != self->delimiter) - ; - else { // backtrack - /* We have to use i + 1 because buf has been incremented but not i */ - do { - --buf; - --i; - } while (i + 1 > self->datapos && *buf != self->lineterminator); - - if (*buf == self->lineterminator) // reached a newline rather than the beginning - { - ++buf; // move pointer to first char after newline - ++i; - } - self->state = START_FIELD; - } - break; - - case ESCAPED_CHAR: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case IN_FIELD: - /* in unquoted field */ - if (c == self->lineterminator) { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == self->delimiter) { - // End of field. 
End of line not reached yet - END_FIELD(); - self->state = START_FIELD; - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - /* in quoted field */ - if (c == self->escapechar) { - /* Possible escape character */ - self->state = ESCAPE_IN_QUOTED_FIELD; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - if (self->doublequote) { - /* doublequote; " represented by "" */ - self->state = QUOTE_IN_QUOTED_FIELD; - } - else { - /* end of quote part of field */ - self->state = IN_FIELD; - } - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - /* doublequote - seen a quote in an quoted field */ - if (self->quoting != QUOTE_NONE && c == self->quotechar) { - /* save "" as " */ - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } - else if (c == self->delimiter) { - // End of field. 
End of line not reached yet - - END_FIELD(); - self->state = START_FIELD; - } - else if (c == self->lineterminator) { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (!self->strict) { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - else { - self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, "'%c' expected after '%c'", - self->delimiter, self->quotechar); - goto parsingerror; - } - break; - - case EAT_LINE_COMMENT: - if (c == self->lineterminator) { - self->file_lines++; - self->state = START_RECORD; - } - break; - - case EAT_COMMENT: - if (c == self->lineterminator) { - END_LINE(); - } - break; - - default: - break; - - } - } - - _TOKEN_CLEANUP(); - - TRACE(("Finished tokenizing input\n")) - - return 0; - -parsingerror: - i++; - _TOKEN_CLEANUP(); - - return -1; - -linelimit: - i++; - _TOKEN_CLEANUP(); - - return 0; -} - -int tokenize_whitespace(parser_t *self, size_t line_limit) -{ - int i, slen, start_lines; - long maxstreamsize; - char c; - char *stream; - char *buf = self->data + self->datapos; - - start_lines = self->lines; - - if (make_stream_space(self, self->datalen - self->datapos) < 0) { - self->error_msg = "out of memory"; - return -1; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - maxstreamsize = self->stream_cap; - - TRACE(("%s\n", buf)); - - for (i = self->datapos; i < self->datalen; ++i) - { - // Next character in file - c = *buf++; - - TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch(self->state) { - - case SKIP_LINE: -// TRACE(("tokenize_whitespace SKIP_LINE %c, state %d\n", c, self->state)); - if (c == '\n') { - END_LINE(); - } else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case WHITESPACE_LINE: - if (c == '\n') { - self->file_lines++; - self->state = START_RECORD; - break; - } - else if (c == '\r') { - 
self->file_lines++; - self->state = EAT_CRNL_NOP; - break; - } - // fall through - - case EAT_WHITESPACE: - if (c == '\n') { - END_LINE(); - self->state = START_RECORD; - break; - } else if (c == '\r') { - self->state = EAT_CRNL; - break; - } else if (!IS_WHITESPACE(c)) { - // END_FIELD(); - self->state = START_FIELD; - // Fall through to subsequent state - } else { - // if whitespace char, keep slurping - break; - } - - case START_RECORD: - // start of record - if (skip_this_line(self, self->file_lines)) { - self->state = SKIP_LINE; - if (c == '\n') { - END_LINE(); - } - break; - } else if (c == '\n') { - if (self->skip_empty_lines) - // \n\r possible? - { - self->file_lines++; - } - else - { - END_LINE(); - } - break; - } else if (c == '\r') { - if (self->skip_empty_lines) - { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - else - self->state = EAT_CRNL; - break; - } else if (IS_WHITESPACE(c)) { - if (self->skip_empty_lines) - self->state = WHITESPACE_LINE; - else - self->state = EAT_WHITESPACE; - break; - } else if (c == self->commentchar) { - self->state = EAT_LINE_COMMENT; - break; - } else { - /* normal character - handle as START_FIELD */ - self->state = START_FIELD; - } - /* fallthru */ - case START_FIELD: - /* expecting field */ - if (c == '\n') { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - /* start quoted field */ - self->state = IN_QUOTED_FIELD; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - /* else if (c == ' ' && self->skipinitialspace) */ - /* /\* ignore space at start of field *\/ */ - /* ; */ - else if (IS_WHITESPACE(c)) { - self->state = EAT_WHITESPACE; - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* begin new unquoted field */ - if (self->quoting == 
QUOTE_NONNUMERIC) - self->numeric_field = 1; - - // TRACE(("pushing %c", c)); - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case EAT_LINE_COMMENT: - if (c == '\n') { - self->file_lines++; - self->state = START_RECORD; - } else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case ESCAPED_CHAR: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case IN_FIELD: - /* in unquoted field */ - if (c == '\n') { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (IS_WHITESPACE(c)) { - // End of field. End of line not reached yet - END_FIELD(); - self->state = EAT_WHITESPACE; - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - /* in quoted field */ - if (c == self->escapechar) { - /* Possible escape character */ - self->state = ESCAPE_IN_QUOTED_FIELD; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - if (self->doublequote) { - /* doublequote; " represented by "" */ - self->state = QUOTE_IN_QUOTED_FIELD; - } - else { - /* end of quote part of field */ - self->state = IN_FIELD; - } - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - /* doublequote - seen a quote in an quoted field */ - if (self->quoting != QUOTE_NONE && c == self->quotechar) { - /* save "" as " */ - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } - else if (IS_WHITESPACE(c)) { - // End of field. 
End of line not reached yet - - END_FIELD(); - self->state = EAT_WHITESPACE; - } - else if (c == '\n') { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (!self->strict) { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - else { - self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, "'%c' expected after '%c'", - self->delimiter, self->quotechar); - goto parsingerror; - } - break; - - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - /* self->state = START_RECORD; */ - } else if (IS_WHITESPACE(c)){ - // Handle \r-delimited files - END_LINE_STATE(EAT_WHITESPACE); - } else { - /* XXX - * first character of a new record--need to back up and reread - * to handle properly... - */ - i--; buf--; /* back up one character (HACK!) */ - END_LINE_STATE(START_RECORD); - } - break; +#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) - case EAT_CRNL_NOP: // inside an ignored comment line - self->state = START_RECORD; - /* \r line terminator -- parse this character again */ - if (c != '\n' && c != self->delimiter) { - --i; - --buf; - } - break; +// don't parse '\r' with a custom line terminator +#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r')) - case EAT_COMMENT: - if (c == '\n') { - END_LINE(); - } else if (c == '\r') { - self->state = EAT_CRNL; - } - break; +#define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \ + self->skipinitialspace)) - default: - break; +// applied when in a field +#define IS_DELIMITER(c) ((!self->delim_whitespace && c == self->delimiter) || \ + (self->delim_whitespace && IS_WHITESPACE(c))) +#define _TOKEN_CLEANUP() \ + self->stream_len = slen; \ + self->datapos = i; \ + TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); - } +int skip_this_line(parser_t *self, int64_t rownum) { + if (self->skipset != NULL) { + return ( kh_get_int64((kh_int64_t*) 
self->skipset, self->file_lines) != + ((kh_int64_t*)self->skipset)->n_buckets ); + } + else { + return ( rownum <= self->skip_first_N_rows ); } - - _TOKEN_CLEANUP(); - - TRACE(("Finished tokenizing input\n")) - - return 0; - -parsingerror: - i++; - _TOKEN_CLEANUP(); - - return -1; - -linelimit: - i++; - _TOKEN_CLEANUP(); - - return 0; } -// custom line terminator -int tokenize_whitespace_customterm(parser_t *self, size_t line_limit) +int tokenize_bytes(parser_t *self, size_t line_limit) { int i, slen, start_lines; long maxstreamsize; @@ -1668,31 +750,60 @@ int tokenize_whitespace_customterm(parser_t *self, size_t line_limit) // next character in file c = *buf++; - TRACE(("tokenize_whitespace_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n", + TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, state %d\n", i, c, self->file_lines + 1, self->line_fields[self->lines], self->state)); switch(self->state) { case SKIP_LINE: - if (c == self->lineterminator) { + TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state)); + if (IS_TERMINATOR(c)) { END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; } break; case WHITESPACE_LINE: - if (c == self->lineterminator) { + if (IS_TERMINATOR(c)) { self->file_lines++; self->state = START_RECORD; break; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + break; + } else if (!self->delim_whitespace) { + if (IS_WHITESPACE(c) && c != self->delimiter) { + ; + } else { // backtrack + // use i + 1 because buf has been incremented but not i + do { + --buf; + --i; + } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); + + // reached a newline rather than the beginning + if (IS_TERMINATOR(*buf)) { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; } // fall through case EAT_WHITESPACE: - if (c == self->lineterminator) { + if (IS_TERMINATOR(c)) { END_LINE(); 
self->state = START_RECORD; break; + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + break; } else if (!IS_WHITESPACE(c)) { self->state = START_FIELD; // fall through to subsequent state @@ -1705,83 +816,122 @@ int tokenize_whitespace_customterm(parser_t *self, size_t line_limit) // start of record if (skip_this_line(self, self->file_lines)) { self->state = SKIP_LINE; - if (c == self->lineterminator) { + if (IS_TERMINATOR(c)) { END_LINE(); } break; - } else if (c == self->lineterminator) { + } else if (IS_TERMINATOR(c)) { + // \n\r possible? if (self->skip_empty_lines) { self->file_lines++; } else { END_LINE(); } break; - } else if (IS_WHITESPACE(c)) { - if (self->skip_empty_lines) - self->state = WHITESPACE_LINE; - else - self->state = EAT_WHITESPACE; + } else if (IS_CARRIAGE(c)) { + if (self->skip_empty_lines) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else { + self->state = EAT_CRNL; + } break; } else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; - } else { - // nominal character - handle as START_FIELD - self->state = START_FIELD; + } else if (IS_WHITESPACE(c)) { + if (self->delim_whitespace) { + if (self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + } else { + self->state = EAT_WHITESPACE; + } + break; + } else if (c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; + } + // fall through } - // fall through + + // normal character - fall through + // to handle as START_FIELD + self->state = START_FIELD; case START_FIELD: // expecting field - if (c == self->lineterminator) { + if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); - } else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - // start quote field + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_QUOTE(c)) { + // start quoted field self->state = IN_QUOTED_FIELD; } else if (c == self->escapechar) { // possible escaped character self->state = ESCAPED_CHAR; - } 
else if (IS_WHITESPACE(c)) { - self->state = EAT_WHITESPACE; + } else if (IS_SKIPPABLE_SPACE(c)) { + // ignore space at start of field + ; + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + // save empty field + END_FIELD(); + } } else if (c == self->commentchar) { END_FIELD(); self->state = EAT_COMMENT; } else { // begin new unquoted field - if (self->quoting == QUOTE_NONNUMERIC) - self->numeric_field = 1; + // if (self->delim_whitespace && \ + // self->quoting == QUOTE_NONNUMERIC) { + // self->numeric_field = 1; + // } PUSH_CHAR(c); self->state = IN_FIELD; } break; + case ESCAPED_CHAR: + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + case EAT_LINE_COMMENT: - if (c == self->lineterminator) { + if (IS_TERMINATOR(c)) { self->file_lines++; self->state = START_RECORD; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; } break; - case ESCAPED_CHAR: - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - case IN_FIELD: // in unquoted field - if (c == self->lineterminator) { + if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; } else if (c == self->escapechar) { // possible escaped character self->state = ESCAPED_CHAR; - } else if (IS_WHITESPACE(c)) { - // end of field (end of line not reached yet) + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet END_FIELD(); - self->state = EAT_WHITESPACE; + + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } } else if (c == self->commentchar) { END_FIELD(); self->state = EAT_COMMENT; @@ -1796,13 +946,11 @@ int tokenize_whitespace_customterm(parser_t *self, size_t line_limit) if (c == self->escapechar) { // possible escape character self->state = ESCAPE_IN_QUOTED_FIELD; - } else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { + } else if (IS_QUOTE(c)) { if (self->doublequote) { // 
double quote - " represented by "" self->state = QUOTE_IN_QUOTED_FIELD; - } - else { + } else { // end of quote part of field self->state = IN_FIELD; } @@ -1819,50 +967,98 @@ int tokenize_whitespace_customterm(parser_t *self, size_t line_limit) case QUOTE_IN_QUOTED_FIELD: // double quote - seen a quote in an quoted field - if (self->quoting != QUOTE_NONE && c == self->quotechar) { + if (IS_QUOTE(c)) { // save "" as " + PUSH_CHAR(c); self->state = IN_QUOTED_FIELD; - } else if (IS_WHITESPACE(c)) { - // end of field (end of line not reached yet) + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet END_FIELD(); - self->state = EAT_WHITESPACE; - } else if (c == self->lineterminator) { + + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; } else if (!self->strict) { PUSH_CHAR(c); self->state = IN_FIELD; } else { self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, "'%c' expected after '%c'", - self->delimiter, self->quotechar); + sprintf(self->error_msg, + "delimiter expected after " + "quote in quote"); goto parsingerror; } break; - case EAT_CRNL: - if (c == self->lineterminator) { + case EAT_COMMENT: + if (IS_TERMINATOR(c)) { END_LINE(); - } else if (IS_WHITESPACE(c)){ - // Handle \r-delimited files - END_LINE_STATE(EAT_WHITESPACE); - } else { - /* XXX - * first character of a new record--need to back up and reread - * to handle properly... - */ - i--; buf--; // back up one character (HACK!) 
- END_LINE_STATE(START_RECORD); + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; } break; - case EAT_COMMENT: - if (c == self->lineterminator) { + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL: + if (c == '\n') { END_LINE(); + } else if (IS_DELIMITER(c)){ + + if (self->delim_whitespace) { + END_LINE_STATE(EAT_WHITESPACE); + } else { + // Handle \r-delimited files + END_LINE_AND_FIELD_STATE(START_FIELD); + } + } else { + if (self->delim_whitespace) { + /* XXX + * first character of a new record--need to back up and reread + * to handle properly... + */ + i--; buf--; // back up one character (HACK!) + END_LINE_STATE(START_RECORD); + } else { + // \r line terminator + // UGH. we don't actually want + // to consume the token. fix this later + self->stream_len = slen; + if (end_line(self) < 0) { + goto parsingerror; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + self->state = START_RECORD; + + --i; buf--; // let's try this character again (HACK!) 
+ if (line_limit > 0 && self->lines == start_lines + line_limit) { + goto linelimit; + } + } } break; + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL_NOP: // inside an ignored comment line + self->state = START_RECORD; + // \r line terminator -- parse this character again + if (c != '\n' && !IS_DELIMITER(c)) { + --i; + --buf; + } + break; default: break; } @@ -2090,25 +1286,9 @@ void debug_print_parser(parser_t *self) { */ int _tokenize_helper(parser_t *self, size_t nrows, int all) { - parser_op tokenize_bytes; - int status = 0; int start_lines = self->lines; - if (self->delim_whitespace) { - if (self->lineterminator == '\0') { - tokenize_bytes = tokenize_whitespace; - } else { - tokenize_bytes = tokenize_whitespace_customterm; - } - } else { - if (self->lineterminator == '\0') { - tokenize_bytes = tokenize_delimited; - } else { - tokenize_bytes = tokenize_delim_customterm; - } - } - if (self->state == FINISHED) { return 0; } @@ -2135,12 +1315,9 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", self->datalen - self->datapos, self->datalen, self->datapos)); - /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */ status = tokenize_bytes(self, nrows); - /* debug_print_parser(self); */ - if (status < 0) { // XXX TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, breaking\n",