From caec3c52501c3152959fddbc5bb19a809109d667 Mon Sep 17 00:00:00 2001
From: skojoian <sergei.kojoian@gmail.com>
Date: Tue, 12 Mar 2019 03:31:41 -0700
Subject: [PATCH 1/6] BUG: Raise ValueError if a column index in usecols is out
 of bounds. #25623

---
 doc/source/whatsnew/v0.24.2.rst        | 35 ++------------------------
 doc/source/whatsnew/v0.25.0.rst        |  2 +-
 pandas/io/parsers.py                   | 18 ++++++++++++-
 pandas/tests/io/parser/test_usecols.py | 14 +++++++++++
 4 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
index 8da33a46e79c6..2c6d1e01ed89b 100644
--- a/doc/source/whatsnew/v0.24.2.rst
+++ b/doc/source/whatsnew/v0.24.2.rst
@@ -32,7 +32,6 @@ Fixed Regressions
 - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`)
 - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
 - Fixed pip installing from source into an environment without NumPy (:issue:`25193`)
-- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`)
 
 .. _whatsnew_0242.enhancements:
 
@@ -102,41 +101,11 @@ Bug Fixes
 - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`)
 - Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`)
 - Bug in ``IntervalTree`` where a ``RecursionError`` occurs upon construction due to an overflow when adding endpoints, which also causes :class:`IntervalIndex` to crash during indexing operations (:issue:`25485`)
-- Bug in :attr:`Series.size` raising for some extension-array-backed ``Series``, rather than returning the size (:issue:`25580`)
-- Bug in resampling raising for nullable integer-dtype columns (:issue:`25580`)
+-
 
 .. _whatsnew_0242.contributors:
 
 Contributors
 ~~~~~~~~~~~~
 
-.. Including the contributors hardcoded for this release, as backporting with
-   MeeseeksDev loses the commit authors
-
-A total of 25 people contributed patches to this release. People with a "+" by their names contributed a patch for the first time.
-
-* Albert Villanova del Moral
-* Arno Veenstra +
-* chris-b1
-* Devin Petersohn +
-* EternalLearner42 +
-* Flavien Lambert +
-* gfyoung
-* Gioia Ballin
-* jbrockmendel
-* Jeff Reback
-* Jeremy Schendel
-* Johan von Forstner +
-* Joris Van den Bossche
-* Josh
-* Justin Zheng
-* Matthew Roeschke
-* Max Bolingbroke +
-* rbenes +
-* Sterling Paramore +
-* Tao He +
-* Thomas A Caswell
-* Tom Augspurger
-* Vibhu Agarwal +
-* William Ayd
-* Zach Angell
+.. contributors:: v0.24.1..v0.24.2
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 284943cf49070..feafb3cf006e8 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -122,7 +122,7 @@ Bug Fixes
 ~~~~~~~~~
 - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`)
 - Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`)
--
+- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
 
 Categorical
 ^^^^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 4163a571df800..f1b133db8a5ef 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1894,6 +1894,11 @@ def __init__(self, src, **kwds):
                     not set(usecols).issubset(self.orig_names)):
                 _validate_usecols_names(usecols, self.orig_names)
 
+            # GH 25623
+            elif self.usecols_dtype == 'integer':
+                indices = lrange(self._reader.table_width)
+                _validate_usecols_names(usecols, indices)
+
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
@@ -2197,7 +2202,8 @@ def __init__(self, f, **kwds):
         self.skipinitialspace = kwds['skipinitialspace']
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
-        self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
+        self.usecols, self.usecols_dtype = _validate_usecols_arg(
+            kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
         self.warn_bad_lines = kwds['warn_bad_lines']
@@ -2588,6 +2594,12 @@ def _infer_columns(self):
             if clear_buffer:
                 self._clear_buffer()
 
+            # GH 25623
+            if self.usecols_dtype == 'integer':
+                for col in columns:
+                    indices = lrange(len(col))
+                    _validate_usecols_names(self.usecols, indices)
+
             if names is not None:
                 if ((self.usecols is not None and
                      len(names) != len(self.usecols)) or
@@ -2623,6 +2635,10 @@ def _infer_columns(self):
             ncols = len(line)
             num_original_columns = ncols
 
+            # GH25623
+            if self.usecols_dtype == 'integer':
+                _validate_usecols_names(self.usecols, lrange(ncols))
+
             if not names:
                 if self.prefix:
                     columns = [['%s%d' % (self.prefix, i)
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index 652f78d198ee8..91993641201e2 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -21,6 +21,20 @@
                                "expected but not found: {0}")
 
 
+@pytest.mark.parametrize("names,usecols", [
+    (None, [0, 3]),
+    (["a", "b", "c"], [0, -1, 2]),
+    (None, [3]),
+    (["a"], [3])
+])
+def test_usecols_out_of_bounds(all_parsers, names, usecols):
+    data = "a,b,c\n1,2,3\n4,5,6"
+    parser = all_parsers
+
+    with pytest.raises(ValueError, match=_msg_validate_usecols_names):
+        parser.read_csv(StringIO(data), usecols=usecols, names=names)
+
+
 def test_raise_on_mixed_dtype_usecols(all_parsers):
     # See gh-12678
     data = """a,b,c

From 10126e1836e4b35afa44897e730a2b13f1f046a9 Mon Sep 17 00:00:00 2001
From: skojoian <sergei.kojoian@gmail.com>
Date: Tue, 12 Mar 2019 03:31:41 -0700
Subject: [PATCH 2/6] BUG: Raise ValueError if a column index in usecols is out
 of bounds. #25623

---
 doc/source/whatsnew/v0.24.2.rst | 44 +--------------------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
index 2c6d1e01ed89b..5fcf6775d522e 100644
--- a/doc/source/whatsnew/v0.24.2.rst
+++ b/doc/source/whatsnew/v0.24.2.rst
@@ -18,7 +18,7 @@ including other versions of pandas.
 .. _whatsnew_0242.regressions:
 
 Fixed Regressions
-^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~
 
 - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`)
 - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`)
@@ -33,68 +33,26 @@ Fixed Regressions
 - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
 - Fixed pip installing from source into an environment without NumPy (:issue:`25193`)
 
-.. _whatsnew_0242.enhancements:
-
-Enhancements
-^^^^^^^^^^^^
-
--
--
-
 .. _whatsnew_0242.bug_fixes:
 
 Bug Fixes
 ~~~~~~~~~
 
-**Conversion**
-
--
--
--
-
-**Indexing**
-
--
--
--
-
 **I/O**
 
 - Better handling of terminal printing when the terminal dimensions are not known (:issue:`25080`)
 - Bug in reading a HDF5 table-format ``DataFrame`` created in Python 2, in Python 3 (:issue:`24925`)
 - Bug in reading a JSON with ``orient='table'`` generated by :meth:`DataFrame.to_json` with ``index=False`` (:issue:`25170`)
 - Bug where float indexes could have misaligned values when printing (:issue:`25061`)
--
-
-**Categorical**
-
--
--
--
-
-**Timezones**
-
--
--
--
-
-**Timedelta**
-
--
--
--
 
 **Reshaping**
 
 - Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`)
 - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`)
--
 
 **Visualization**
 
 - Bug in :meth:`Series.plot` where a secondary y axis could not be set to log scale (:issue:`25545`)
--
--
 
 **Other**
 

From 4eb55beec63be7644cf985ef10acc7c7843c3acd Mon Sep 17 00:00:00 2001
From: skojoian <sergei.kojoian@gmail.com>
Date: Tue, 12 Mar 2019 20:28:03 -0700
Subject: [PATCH 3/6] update v0.24.2.rst

---
 doc/source/whatsnew/v0.24.2.rst | 43 ++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
index 5fcf6775d522e..6ad299de45e2a 100644
--- a/doc/source/whatsnew/v0.24.2.rst
+++ b/doc/source/whatsnew/v0.24.2.rst
@@ -2,8 +2,8 @@
 
 .. _whatsnew_0242:
 
-Whats New in 0.24.2 (February XX, 2019)
----------------------------------------
+Whats New in 0.24.2 (March 12, 2019)
+------------------------------------
 
 .. warning::
 
@@ -31,7 +31,11 @@ Fixed Regressions
 - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`)
 - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`)
 - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
+- Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`).
 - Fixed pip installing from source into an environment without NumPy (:issue:`25193`)
+- Fixed regression in :meth:`DataFrame.replace` where large strings of numbers would be coerced into ``int64``, causing an ``OverflowError`` (:issue:`25616`)
+- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`).
+- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`)
 
 .. _whatsnew_0242.bug_fixes:
 
@@ -59,11 +63,42 @@ Bug Fixes
 - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`)
 - Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`)
 - Bug in ``IntervalTree`` where a ``RecursionError`` occurs upon construction due to an overflow when adding endpoints, which also causes :class:`IntervalIndex` to crash during indexing operations (:issue:`25485`)
--
+- Bug in :attr:`Series.size` raising for some extension-array-backed ``Series``, rather than returning the size (:issue:`25580`)
+- Bug in resampling raising for nullable integer-dtype columns (:issue:`25580`)
 
 .. _whatsnew_0242.contributors:
 
 Contributors
 ~~~~~~~~~~~~
 
-.. contributors:: v0.24.1..v0.24.2
+.. Including the contributors hardcoded for this release, as backporting with
+   MeeseeksDev loses the commit authors
+
+A total of 25 people contributed patches to this release. People with a "+" by their names contributed a patch for the first time.
+
+* Albert Villanova del Moral
+* Arno Veenstra +
+* chris-b1
+* Devin Petersohn +
+* EternalLearner42 +
+* Flavien Lambert +
+* gfyoung
+* Gioia Ballin
+* jbrockmendel
+* Jeff Reback
+* Jeremy Schendel
+* Johan von Forstner +
+* Joris Van den Bossche
+* Josh
+* Justin Zheng
+* Kendall Masse
+* Matthew Roeschke
+* Max Bolingbroke +
+* rbenes +
+* Sterling Paramore +
+* Tao He +
+* Thomas A Caswell
+* Tom Augspurger
+* Vibhu Agarwal +
+* William Ayd
+* Zach Angell

From 2d22a93ab965618d96d55eced7f4fd19b0804481 Mon Sep 17 00:00:00 2001
From: skojoian <sergei.kojoian@gmail.com>
Date: Sat, 23 Mar 2019 13:58:49 -0700
Subject: [PATCH 4/6] use string formatting with _msg_validate_usecols_names to
 specify missing columns.

---
 pandas/tests/io/parser/test_usecols.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index 91993641201e2..0454ec9fc0ef2 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -21,17 +21,18 @@
                                "expected but not found: {0}")
 
 
-@pytest.mark.parametrize("names,usecols", [
-    (None, [0, 3]),
-    (["a", "b", "c"], [0, -1, 2]),
-    (None, [3]),
-    (["a"], [3])
+@pytest.mark.parametrize("names,usecols,missing", [
+    (None, [0, 3], r"\[3\]"),
+    (["a", "b", "c"], [0, -1, 2], r"\[-1\]"),
+    (None, [3], r"\[3\]"),
+    (["a"], [3], r"\[3\]")
 ])
-def test_usecols_out_of_bounds(all_parsers, names, usecols):
+def test_usecols_out_of_bounds(all_parsers, names, usecols, missing):
     data = "a,b,c\n1,2,3\n4,5,6"
     parser = all_parsers
-
-    with pytest.raises(ValueError, match=_msg_validate_usecols_names):
+    
+    mssg = _msg_validate_usecols_names.format(missing)
+    with pytest.raises(ValueError, match=mssg):
         parser.read_csv(StringIO(data), usecols=usecols, names=names)
 
 

From 2d8835d7d60c8077d24ccd0c4bc4414754a23df3 Mon Sep 17 00:00:00 2001
From: skojoian <sergei.kojoian@gmail.com>
Date: Sat, 23 Mar 2019 14:11:14 -0700
Subject: [PATCH 5/6] Move the bug fix documentation to the I/O section

---
 doc/source/whatsnew/v0.25.0.rst        | 3 +--
 pandas/tests/io/parser/test_usecols.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index c1225ff2a7ed9..1ff98b8b7b8d6 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -184,7 +184,6 @@ Bug Fixes
 ~~~~~~~~~
 - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`)
 - Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`)
-- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
 
 Categorical
 ^^^^^^^^^^^
@@ -286,7 +285,7 @@ I/O
 - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
 - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
 - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
-
+- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index 0454ec9fc0ef2..9cd294198c9cb 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -30,7 +30,7 @@
 def test_usecols_out_of_bounds(all_parsers, names, usecols, missing):
     data = "a,b,c\n1,2,3\n4,5,6"
     parser = all_parsers
-    
+
     mssg = _msg_validate_usecols_names.format(missing)
     with pytest.raises(ValueError, match=mssg):
         parser.read_csv(StringIO(data), usecols=usecols, names=names)

From 048094d5b2a80ff3fa2e8c17cc6d9c4eb64b2c44 Mon Sep 17 00:00:00 2001
From: skojoian <sergei.kojoian@gmail.com>
Date: Sun, 24 Mar 2019 19:28:36 -0700
Subject: [PATCH 6/6] Add issue number as a comment in the test. Add 1-line
 comments describing the validations.

---
 doc/source/whatsnew/v0.25.0.rst        | 3 +--
 pandas/io/parsers.py                   | 5 ++++-
 pandas/tests/io/parser/test_usecols.py | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 1ff98b8b7b8d6..aa350c4a0a2e2 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -182,8 +182,7 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
-- Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`)
-- Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`)
+
 
 Categorical
 ^^^^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index bf13958dec016..a8948f39a8583 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1902,6 +1902,7 @@ def __init__(self, src, **kwds):
                 _validate_usecols_names(usecols, self.orig_names)
 
             # GH 25623
+            # validate that column indices in usecols are not out of bounds
             elif self.usecols_dtype == 'integer':
                 indices = lrange(self._reader.table_width)
                 _validate_usecols_names(usecols, indices)
@@ -2604,6 +2605,7 @@ def _infer_columns(self):
                 self._clear_buffer()
 
             # GH 25623
+            # validate that column indices in usecols are not out of bounds
             if self.usecols_dtype == 'integer':
                 for col in columns:
                     indices = lrange(len(col))
@@ -2644,7 +2646,8 @@ def _infer_columns(self):
             ncols = len(line)
             num_original_columns = ncols
 
-            # GH25623
+            # GH 25623
+            # validate that column indices in usecols are not out of bounds
             if self.usecols_dtype == 'integer':
                 _validate_usecols_names(self.usecols, lrange(ncols))
 
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index 9cd294198c9cb..e513f2d755d07 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -28,6 +28,7 @@
     (["a"], [3], r"\[3\]")
 ])
 def test_usecols_out_of_bounds(all_parsers, names, usecols, missing):
+    # See gh-25623
     data = "a,b,c\n1,2,3\n4,5,6"
     parser = all_parsers