From caec3c52501c3152959fddbc5bb19a809109d667 Mon Sep 17 00:00:00 2001 From: skojoian Date: Tue, 12 Mar 2019 03:31:41 -0700 Subject: [PATCH 1/6] BUG: Raise ValueError if a column index in usecols is out of bounds. #25623 --- doc/source/whatsnew/v0.24.2.rst | 35 ++------------------------ doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/io/parsers.py | 18 ++++++++++++- pandas/tests/io/parser/test_usecols.py | 14 +++++++++++ 4 files changed, 34 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 8da33a46e79c6..2c6d1e01ed89b 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -32,7 +32,6 @@ Fixed Regressions - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) -- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`) .. 
_whatsnew_0242.enhancements: @@ -102,41 +101,11 @@ Bug Fixes - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) - Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`) - Bug in ``IntervalTree`` where a ``RecursionError`` occurs upon construction due to an overflow when adding endpoints, which also causes :class:`IntervalIndex` to crash during indexing operations (:issue:`25485`) -- Bug in :attr:`Series.size` raising for some extension-array-backed ``Series``, rather than returning the size (:issue:`25580`) -- Bug in resampling raising for nullable integer-dtype columns (:issue:`25580`) +- .. _whatsnew_0242.contributors: Contributors ~~~~~~~~~~~~ -.. Including the contributors hardcoded for this release, as backporting with - MeeseeksDev loses the commit authors - -A total of 25 people contributed patches to this release. People with a "+" by their names contributed a patch for the first time. - -* Albert Villanova del Moral -* Arno Veenstra + -* chris-b1 -* Devin Petersohn + -* EternalLearner42 + -* Flavien Lambert + -* gfyoung -* Gioia Ballin -* jbrockmendel -* Jeff Reback -* Jeremy Schendel -* Johan von Forstner + -* Joris Van den Bossche -* Josh -* Justin Zheng -* Matthew Roeschke -* Max Bolingbroke + -* rbenes + -* Sterling Paramore + -* Tao He + -* Thomas A Caswell -* Tom Augspurger -* Vibhu Agarwal + -* William Ayd -* Zach Angell +.. 
contributors:: v0.24.1..v0.24.2 diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 284943cf49070..feafb3cf006e8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -122,7 +122,7 @@ Bug Fixes ~~~~~~~~~ - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) - Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) -- +- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) Categorical ^^^^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4163a571df800..f1b133db8a5ef 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1894,6 +1894,11 @@ def __init__(self, src, **kwds): not set(usecols).issubset(self.orig_names)): _validate_usecols_names(usecols, self.orig_names) + # GH 25623 + elif self.usecols_dtype == 'integer': + indices = lrange(self._reader.table_width) + _validate_usecols_names(usecols, indices) + if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] @@ -2197,7 +2202,8 @@ def __init__(self, f, **kwds): self.skipinitialspace = kwds['skipinitialspace'] self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] - self.usecols, _ = _validate_usecols_arg(kwds['usecols']) + self.usecols, self.usecols_dtype = _validate_usecols_arg( + kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] self.warn_bad_lines = kwds['warn_bad_lines'] @@ -2588,6 +2594,12 @@ def _infer_columns(self): if clear_buffer: self._clear_buffer() + # GH 25623 + if self.usecols_dtype == 'integer': + for col in columns: + indices = lrange(len(col)) + _validate_usecols_names(self.usecols, 
indices) + if names is not None: if ((self.usecols is not None and len(names) != len(self.usecols)) or @@ -2623,6 +2635,10 @@ def _infer_columns(self): ncols = len(line) num_original_columns = ncols + # GH25623 + if self.usecols_dtype == 'integer': + _validate_usecols_names(self.usecols, lrange(ncols)) + if not names: if self.prefix: columns = [['%s%d' % (self.prefix, i) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 652f78d198ee8..91993641201e2 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -21,6 +21,20 @@ "expected but not found: {0}") +@pytest.mark.parametrize("names,usecols", [ + (None, [0, 3]), + (["a", "b", "c"], [0, -1, 2]), + (None, [3]), + (["a"], [3]) +]) +def test_usecols_out_of_bounds(all_parsers, names, usecols): + data = "a,b,c\n1,2,3\n4,5,6" + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_names): + parser.read_csv(StringIO(data), usecols=usecols, names=names) + + def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678 data = """a,b,c From 10126e1836e4b35afa44897e730a2b13f1f046a9 Mon Sep 17 00:00:00 2001 From: skojoian Date: Tue, 12 Mar 2019 03:31:41 -0700 Subject: [PATCH 2/6] BUG: Raise ValueError if a column index in usecols is out of bounds. #25623 --- doc/source/whatsnew/v0.24.2.rst | 44 +-------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 2c6d1e01ed89b..5fcf6775d522e 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -18,7 +18,7 @@ including other versions of pandas. .. 
_whatsnew_0242.regressions: Fixed Regressions -^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`) - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) @@ -33,68 +33,26 @@ Fixed Regressions - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) -.. _whatsnew_0242.enhancements: - -Enhancements -^^^^^^^^^^^^ - -- -- - .. _whatsnew_0242.bug_fixes: Bug Fixes ~~~~~~~~~ -**Conversion** - -- -- -- - -**Indexing** - -- -- -- - **I/O** - Better handling of terminal printing when the terminal dimensions are not known (:issue:`25080`) - Bug in reading a HDF5 table-format ``DataFrame`` created in Python 2, in Python 3 (:issue:`24925`) - Bug in reading a JSON with ``orient='table'`` generated by :meth:`DataFrame.to_json` with ``index=False`` (:issue:`25170`) - Bug where float indexes could have misaligned values when printing (:issue:`25061`) -- - -**Categorical** - -- -- -- - -**Timezones** - -- -- -- - -**Timedelta** - -- -- -- **Reshaping** - Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`) -- **Visualization** - Bug in :meth:`Series.plot` where a secondary y axis could not be set to log scale (:issue:`25545`) -- -- **Other** From 4eb55beec63be7644cf985ef10acc7c7843c3acd Mon Sep 17 00:00:00 2001 From: skojoian Date: Tue, 12 Mar 2019 20:28:03 -0700 Subject: [PATCH 3/6] update v0.24.2.rst --- doc/source/whatsnew/v0.24.2.rst 
| 43 ++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 5fcf6775d522e..6ad299de45e2a 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -2,8 +2,8 @@ .. _whatsnew_0242: -Whats New in 0.24.2 (February XX, 2019) ---------------------------------------- +Whats New in 0.24.2 (March 12, 2019) +------------------------------------ .. warning:: @@ -31,7 +31,11 @@ Fixed Regressions - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) +- Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`). - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) +- Fixed regression in :meth:`DataFrame.replace` where large strings of numbers would be coerced into ``int64``, causing an ``OverflowError`` (:issue:`25616`) +- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`). +- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`) .. 
_whatsnew_0242.bug_fixes: @@ -59,11 +63,42 @@ Bug Fixes - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) - Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`) - Bug in ``IntervalTree`` where a ``RecursionError`` occurs upon construction due to an overflow when adding endpoints, which also causes :class:`IntervalIndex` to crash during indexing operations (:issue:`25485`) -- +- Bug in :attr:`Series.size` raising for some extension-array-backed ``Series``, rather than returning the size (:issue:`25580`) +- Bug in resampling raising for nullable integer-dtype columns (:issue:`25580`) .. _whatsnew_0242.contributors: Contributors ~~~~~~~~~~~~ -.. contributors:: v0.24.1..v0.24.2 +.. Including the contributors hardcoded for this release, as backporting with + MeeseeksDev loses the commit authors + +A total of 25 people contributed patches to this release. People with a "+" by their names contributed a patch for the first time. + +* Albert Villanova del Moral +* Arno Veenstra + +* chris-b1 +* Devin Petersohn + +* EternalLearner42 + +* Flavien Lambert + +* gfyoung +* Gioia Ballin +* jbrockmendel +* Jeff Reback +* Jeremy Schendel +* Johan von Forstner + +* Joris Van den Bossche +* Josh +* Justin Zheng +* Kendall Masse +* Matthew Roeschke +* Max Bolingbroke + +* rbenes + +* Sterling Paramore + +* Tao He + +* Thomas A Caswell +* Tom Augspurger +* Vibhu Agarwal + +* William Ayd +* Zach Angell From 2d22a93ab965618d96d55eced7f4fd19b0804481 Mon Sep 17 00:00:00 2001 From: skojoian Date: Sat, 23 Mar 2019 13:58:49 -0700 Subject: [PATCH 4/6] use string formatting with _msg_validate_usecols_names to specify missing columns. 
--- pandas/tests/io/parser/test_usecols.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 91993641201e2..0454ec9fc0ef2 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -21,17 +21,18 @@ "expected but not found: {0}") -@pytest.mark.parametrize("names,usecols", [ - (None, [0, 3]), - (["a", "b", "c"], [0, -1, 2]), - (None, [3]), - (["a"], [3]) +@pytest.mark.parametrize("names,usecols,missing", [ + (None, [0, 3], r"\[3\]"), + (["a", "b", "c"], [0, -1, 2], r"\[-1\]"), + (None, [3], r"\[3\]"), + (["a"], [3], r"\[3\]") ]) -def test_usecols_out_of_bounds(all_parsers, names, usecols): +def test_usecols_out_of_bounds(all_parsers, names, usecols, missing): data = "a,b,c\n1,2,3\n4,5,6" parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_names): + + mssg = _msg_validate_usecols_names.format(missing) + with pytest.raises(ValueError, match=mssg): parser.read_csv(StringIO(data), usecols=usecols, names=names) From 2d8835d7d60c8077d24ccd0c4bc4414754a23df3 Mon Sep 17 00:00:00 2001 From: skojoian Date: Sat, 23 Mar 2019 14:11:14 -0700 Subject: [PATCH 5/6] Move the bug fix documentation to the I/O section --- doc/source/whatsnew/v0.25.0.rst | 3 +-- pandas/tests/io/parser/test_usecols.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c1225ff2a7ed9..1ff98b8b7b8d6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -184,7 +184,6 @@ Bug Fixes ~~~~~~~~~ - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) - Bug in an error message in :meth:`DataFrame.plot`. 
Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) -- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) Categorical ^^^^^^^^^^^ @@ -286,7 +285,7 @@ I/O - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - +- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) Plotting ^^^^^^^^ diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 0454ec9fc0ef2..9cd294198c9cb 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -30,7 +30,7 @@ def test_usecols_out_of_bounds(all_parsers, names, usecols, missing): data = "a,b,c\n1,2,3\n4,5,6" parser = all_parsers - + mssg = _msg_validate_usecols_names.format(missing) with pytest.raises(ValueError, match=mssg): parser.read_csv(StringIO(data), usecols=usecols, names=names) From 048094d5b2a80ff3fa2e8c17cc6d9c4eb64b2c44 Mon Sep 17 00:00:00 2001 From: skojoian Date: Sun, 24 Mar 2019 19:28:36 -0700 Subject: [PATCH 6/6] Add issue number as a comment in the test. Add 1-line comments describing the validations. 
--- doc/source/whatsnew/v0.25.0.rst | 3 +-- pandas/io/parsers.py | 5 ++++- pandas/tests/io/parser/test_usecols.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1ff98b8b7b8d6..aa350c4a0a2e2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -182,8 +182,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) -- Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) + Categorical ^^^^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bf13958dec016..a8948f39a8583 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1902,6 +1902,7 @@ def __init__(self, src, **kwds): _validate_usecols_names(usecols, self.orig_names) # GH 25623 + # validate that column indices in usecols are not out of bounds elif self.usecols_dtype == 'integer': indices = lrange(self._reader.table_width) _validate_usecols_names(usecols, indices) @@ -2604,6 +2605,7 @@ def _infer_columns(self): self._clear_buffer() # GH 25623 + # validate that column indices in usecols are not out of bounds if self.usecols_dtype == 'integer': for col in columns: indices = lrange(len(col)) @@ -2644,7 +2646,8 @@ def _infer_columns(self): ncols = len(line) num_original_columns = ncols - # GH25623 + # GH 25623 + # validate that column indices in usecols are not out of bounds if self.usecols_dtype == 'integer': _validate_usecols_names(self.usecols, lrange(ncols)) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 9cd294198c9cb..e513f2d755d07 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ 
b/pandas/tests/io/parser/test_usecols.py @@ -28,6 +28,7 @@ (["a"], [3], r"\[3\]") ]) def test_usecols_out_of_bounds(all_parsers, names, usecols, missing): + # See gh-25623 data = "a,b,c\n1,2,3\n4,5,6" parser = all_parsers