From 812f9288ebf00155a39ef25751fabaaccfa5cc68 Mon Sep 17 00:00:00 2001 From: "Praggastis, Brenda" Date: Mon, 22 May 2017 22:29:49 -0700 Subject: [PATCH 1/3] gh-14671 Check if usecols with type string contains a subset of names, if not throws an error --- pandas/io/parsers.py | 6 ++++++ pandas/tests/io/parser/usecols.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index aab70c8ce2cd4..5f73a2e589c8a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1649,6 +1649,12 @@ def __init__(self, src, **kwds): if self.usecols: usecols = _evaluate_usecols(self.usecols, self.orig_names) + + #gh-14671 + if (self.usecols_dtype == 'string') and \ + (not set(usecols).issubset(self.orig_names)): + raise ValueError("Usecols do not match names.") + if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 8761d1ccd3da4..85fdf802d60b7 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -9,6 +9,7 @@ import numpy as np import pandas.util.testing as tm +import re from pandas import DataFrame, Index from pandas._libs.lib import Timestamp @@ -475,3 +476,20 @@ def test_uneven_length_cols(self): 'C': [3, 5, 4, 3, 3, 7]}) df = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(df, expected) + + def test_raise_on_usecols_names_mismatch(self): + # see gh-14671 + data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' + usecols = ['a','b','c','d'] + df = self.read_csv(StringIO(data), usecols=usecols) + expected = DataFrame({'a': [1,5], 'b': [2,6], 'c': [3,7], 'd': [4,8]}) + tm.assert_frame_equal(df, expected) + + msg = 'Usecols do not match names' ## from parsers.py CParserWrapper() + msg2 = 'is not in list' ## from parser.py _handle_usecols() + usecols = ['a','b','c','f'] + with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): + self.read_csv(StringIO(data), usecols=usecols) + usecols = ['a','b','f'] + with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): + self.read_csv(StringIO(data), usecols=usecols) From 1968a70a4b821b0075d87c3b37e273ec876d84bf Mon Sep 17 00:00:00 2001 From: "Praggastis, Brenda" Date: Tue, 23 May 2017 10:13:18 -0700 Subject: [PATCH 2/3] tests added for gh-14671, expected behavior of simultaneous use of usecols and names unclear so these tests are commented out --- pandas/tests/io/parser/usecols.py | 32 ++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 85fdf802d60b7..44c3c4dcffccd 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -478,18 +478,44 @@ def test_uneven_length_cols(self): tm.assert_frame_equal(df, expected) def test_raise_on_usecols_names_mismatch(self): - # see gh-14671 + ## see gh-14671 data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' + msg = 'Usecols do not match names' ## from parsers.py CParserWrapper() + msg2 = 'is not in list' ## from parser.py _handle_usecols() + usecols = ['a','b','c','d'] df = self.read_csv(StringIO(data), usecols=usecols) expected = DataFrame({'a': [1,5], 'b': [2,6], 'c': [3,7], 'd': [4,8]}) tm.assert_frame_equal(df, expected) - msg = 'Usecols do not match names' ## from parsers.py CParserWrapper() - msg2 = 'is not in list' ## from parser.py _handle_usecols() usecols = ['a','b','c','f'] with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): self.read_csv(StringIO(data), usecols=usecols) + usecols = ['a','b','f'] with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): self.read_csv(StringIO(data), usecols=usecols) + + names = ['A', 'B', 'C', 'D'] + + df = self.read_csv(StringIO(data), header=0, names=names) + expected = DataFrame({'A': [1,5], 'B': [2,6], 'C': [3,7], 'D': [4,8]}) + tm.assert_frame_equal(df, expected) + + # usecols = ['A','C'] + # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + # expected = DataFrame({'A': [1,5], 'C': [3,7]}) + # tm.assert_frame_equal(df, expected) + # + # usecols = [0,2] + # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + # expected = DataFrame({'A': [1,5], 'C': [3,7]}) + # tm.assert_frame_equal(df, expected) + + + usecols = ['A','B','C','f'] + with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): + self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + usecols = ['A','B','f'] + with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): + self.read_csv(StringIO(data), names=names, usecols=usecols) From 3418bdeb535aa82ac36dada5302ae3fb845b570d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Jun 2017 20:14:43 -0500 Subject: [PATCH 3/3] Review comments --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/io/parsers.py | 8 +++--- pandas/tests/io/parser/usecols.py | 45 ++++++++++++++++++------------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index c9486954258c8..ea92c45b7e35b 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -72,6 +72,7 @@ I/O ^^^ - Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`) +- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`) - Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5f73a2e589c8a..055d6d045d2f2 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1650,10 +1650,10 @@ def __init__(self, src, **kwds): if self.usecols: usecols = _evaluate_usecols(self.usecols, self.orig_names) - #gh-14671 - if (self.usecols_dtype == 'string') and \ - (not set(usecols).issubset(self.orig_names)): - raise ValueError("Usecols do not match names.") + # GH 14671 + if (self.usecols_dtype == 'string' and + not set(usecols).issubset(self.orig_names)): + raise ValueError("Usecols do not match names.") if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 44c3c4dcffccd..f582e5037ca07 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -9,7 +9,6 @@ import numpy as np import pandas.util.testing as tm -import re from pandas import DataFrame, Index from pandas._libs.lib import Timestamp @@ -478,44 +477,52 @@ def test_uneven_length_cols(self): tm.assert_frame_equal(df, expected) def test_raise_on_usecols_names_mismatch(self): - ## see gh-14671 + # GH 14671 data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' - msg = 'Usecols do not match names' ## from parsers.py CParserWrapper() - msg2 = 'is not in list' ## from parser.py _handle_usecols() - usecols = ['a','b','c','d'] + if self.engine == 'c': + msg = 'Usecols do not match names' + else: + msg = 'is not in list' + + usecols = ['a', 'b', 'c', 'd'] df = self.read_csv(StringIO(data), usecols=usecols) - expected = DataFrame({'a': [1,5], 'b': [2,6], 'c': [3,7], 'd': [4,8]}) + expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7], + 'd': [4, 8]}) tm.assert_frame_equal(df, expected) - usecols = ['a','b','c','f'] - with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): + usecols = ['a', 'b', 'c', 'f'] + with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(data), usecols=usecols) - usecols = ['a','b','f'] - with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): + usecols = ['a', 'b', 'f'] + with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] df = self.read_csv(StringIO(data), header=0, names=names) - expected = DataFrame({'A': [1,5], 'B': [2,6], 'C': [3,7], 'D': [4,8]}) + expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7], + 'D': [4, 8]}) tm.assert_frame_equal(df, expected) + # TODO: https://github.com/pandas-dev/pandas/issues/16469 # usecols = ['A','C'] - # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + # df = self.read_csv(StringIO(data), header=0, names=names, + # usecols=usecols) # expected = DataFrame({'A': [1,5], 'C': [3,7]}) # tm.assert_frame_equal(df, expected) # # usecols = [0,2] - # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + # df = self.read_csv(StringIO(data), header=0, names=names, + # usecols=usecols) # expected = DataFrame({'A': [1,5], 'C': [3,7]}) # tm.assert_frame_equal(df, expected) - - usecols = ['A','B','C','f'] - with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): - self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) - usecols = ['A','B','f'] - with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")): + usecols = ['A', 'B', 'C', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), header=0, names=names, + usecols=usecols) + usecols = ['A', 'B', 'f'] + with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(data), names=names, usecols=usecols)