From d15e3a3317d2406830f058abcd1a1c8be54ea559 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 3 Jan 2017 23:49:23 -0800 Subject: [PATCH] ENH: Accept callable for skiprows --- doc/source/io.rst | 10 ++++++++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 28 ++++++++++++++++++++-------- pandas/io/tests/parser/skiprows.py | 25 +++++++++++++++++++++++++ pandas/parser.pyx | 26 ++++++++++++++++++++++---- pandas/src/parser/tokenizer.c | 30 ++++++++++++++++++++++++++++-- pandas/src/parser/tokenizer.h | 1 + 7 files changed, 107 insertions(+), 14 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index dae97f7bc7f34..9f5e6f2331bc5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -187,6 +187,16 @@ skipinitialspace : boolean, default ``False`` skiprows : list-like or integer, default ``None`` Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise: + + .. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) + skipfooter : int, default ``0`` Number of lines at bottom of file to skip (unsupported with engine='c'). skip_footer : int, default ``0`` diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 70fddea3fe1a9..eb3c6c40682ed 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -110,6 +110,7 @@ Other enhancements - ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) +- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f2c3113fc2cdd..fdf26fdef6b25 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -132,9 +132,13 @@ Values to consider as False skipinitialspace : boolean, default False Skip spaces after delimiter. -skiprows : list-like or integer, default None +skiprows : list-like or integer or callable, default None Line numbers to skip (0-indexed) or number of lines to skip (int) - at the start of the file + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with engine='c') skip_footer : int, default 0 @@ -930,7 +934,10 @@ def _clean_options(self, options, engine): if engine != 'c': if is_integer(skiprows): skiprows = lrange(skiprows) - skiprows = set() if skiprows is None else set(skiprows) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) # put stuff back result['names'] = names @@ -1851,6 +1858,11 @@ def __init__(self, f, **kwds): self.memory_map = kwds['memory_map'] self.skiprows = kwds['skiprows'] + if callable(self.skiprows): + self.skipfunc = self.skiprows + else: + self.skipfunc = lambda x: x in self.skiprows + self.skipfooter = kwds['skipfooter'] self.delimiter = kwds['delimiter'] @@ -2006,7 +2018,7 @@ class MyDialect(csv.Dialect): # attempt to sniff the delimiter if sniff_sep: line = f.readline() - while self.pos in self.skiprows: + while self.skipfunc(self.pos): self.pos += 1 line = f.readline() @@ -2414,7 +2426,7 @@ def _empty(self, line): def _next_line(self): if isinstance(self.data, list): - while self.pos in self.skiprows: + while self.skipfunc(self.pos): self.pos += 1 while True: @@ -2433,7 +2445,7 @@ def _next_line(self): except IndexError: raise StopIteration else: - while self.pos in self.skiprows: + while self.skipfunc(self.pos): self.pos += 1 next(self.data) @@ -2685,7 +2697,7 @@ def _get_lines(self, rows=None): # Check for stop rows. n.b.: self.skiprows is a set. if self.skiprows: new_rows = [row for i, row in enumerate(new_rows) - if i + self.pos not in self.skiprows] + if not self.skipfunc(i + self.pos)] lines.extend(new_rows) self.pos = new_pos @@ -2713,7 +2725,7 @@ def _get_lines(self, rows=None): except StopIteration: if self.skiprows: new_rows = [row for i, row in enumerate(new_rows) - if self.pos + i not in self.skiprows] + if not self.skipfunc(i + self.pos)] lines.extend(new_rows) if len(lines) == 0: raise diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index 9f01adb6fabcb..c53e6a1579267 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -12,6 +12,7 @@ import pandas.util.testing as tm from pandas import DataFrame +from pandas.io.common import EmptyDataError from pandas.compat import StringIO, range, lrange @@ -198,3 +199,27 @@ def test_skiprows_infield_quote(self): df = self.read_csv(StringIO(data), skiprows=2) tm.assert_frame_equal(df, expected) + + def test_skiprows_callable(self): + data = 'a\n1\n2\n3\n4\n5' + + skiprows = lambda x: x % 2 == 0 + expected = DataFrame({'1': [3, 5]}) + df = self.read_csv(StringIO(data), skiprows=skiprows) + tm.assert_frame_equal(df, expected) + + expected = DataFrame({'foo': [3, 5]}) + df = self.read_csv(StringIO(data), skiprows=skiprows, + header=0, names=['foo']) + tm.assert_frame_equal(df, expected) + + skiprows = lambda x: True + msg = "No columns to parse from file" + with tm.assertRaisesRegexp(EmptyDataError, msg): + self.read_csv(StringIO(data), skiprows=skiprows) + + # This is a bad callable and should raise. + msg = "by zero" + skiprows = lambda x: 1 / 0 + with tm.assertRaisesRegexp(ZeroDivisionError, msg): + self.read_csv(StringIO(data), skiprows=skiprows) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 7b31f7fe27c1e..bd793c98eef5b 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -178,6 +178,7 @@ cdef extern from "parser/tokenizer.h": int header_end # header row end void *skipset + PyObject *skipfunc int64_t skip_first_N_rows int skipfooter double (*converter)(const char *, char **, char, char, char, int) nogil @@ -606,9 +607,11 @@ cdef class TextReader: cdef _make_skiprow_set(self): if isinstance(self.skiprows, (int, np.integer)): parser_set_skipfirstnrows(self.parser, self.skiprows) - else: + elif not callable(self.skiprows): for i in self.skiprows: parser_add_skiprow(self.parser, i) + else: + self.parser.skipfunc = self.skiprows cdef _setup_parser_source(self, source): cdef: @@ -2115,18 +2118,33 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: cdef raise_parser_error(object base, parser_t *parser): cdef: object old_exc + object exc_type PyObject *type PyObject *value PyObject *traceback if PyErr_Occurred(): - PyErr_Fetch(&type, &value, &traceback); - Py_XDECREF(type) + PyErr_Fetch(&type, &value, &traceback) Py_XDECREF(traceback) + if value != NULL: old_exc = value Py_XDECREF(value) - raise old_exc + + # PyErr_Fetch only returned the error message in *value, + # so the Exception class must be extracted from *type. + if isinstance(old_exc, compat.string_types): + if type != NULL: + exc_type = type + else: + exc_type = ParserError + + Py_XDECREF(type) + raise exc_type(old_exc) + else: + Py_XDECREF(type) + raise old_exc + message = '%s. C error: ' % base if parser.error_msg != NULL: if PY3: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index bc729cd3e7453..87e17fe5fb751 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -124,6 +124,7 @@ void parser_set_default_options(parser_t *self) { self->thousands = '\0'; self->skipset = NULL; + self->skipfunc = NULL; self->skip_first_N_rows = -1; self->skip_footer = 0; } @@ -679,7 +680,27 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { } int skip_this_line(parser_t *self, int64_t rownum) { - if (self->skipset != NULL) { + int should_skip; + PyObject *result; + PyGILState_STATE state; + + if (self->skipfunc != NULL) { + state = PyGILState_Ensure(); + result = PyObject_CallFunction(self->skipfunc, "i", rownum); + + // Error occurred. It will be processed + // and caught at the Cython level. + if (result == NULL) { + should_skip = -1; + } else { + should_skip = PyObject_IsTrue(result); + } + + Py_XDECREF(result); + PyGILState_Release(state); + + return should_skip; + } else if (self->skipset != NULL) { return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != ((kh_int64_t *)self->skipset)->n_buckets); } else { @@ -689,6 +710,7 @@ int skip_this_line(parser_t *self, int64_t rownum) { int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { int i, slen; + int should_skip; long maxstreamsize; char c; char *stream; @@ -818,7 +840,11 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { case START_RECORD: // start of record - if (skip_this_line(self, self->file_lines)) { + should_skip = skip_this_line(self, self->file_lines); + + if (should_skip == -1) { + goto parsingerror; + } else if (should_skip) { if (IS_QUOTE(c)) { self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else { diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index d31bf4b688c58..e7271cabb0752 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -198,6 +198,7 @@ typedef struct parser_t { int header_end; // header row end void *skipset; + PyObject *skipfunc; int64_t skip_first_N_rows; int skip_footer; double (*converter)(const char *, char **, char, char, char, int);