diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 82d43db667550..a41be65cc3b79 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -39,6 +39,8 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) +- Bug in ``pd.read_csv`` in which a dictionary passed in as ``na_values`` was being aliased instead of copied (:issue:`14203`) +- Bug in ``pd.read_csv`` in which column indices for a dict-like ``na_values`` were not being respected (:issue:`14203`) - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`) - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`) - Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 200943324ce66..8e4246787ed5b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2055,9 +2055,27 @@ def _clean_mapping(mapping): else: clean_dtypes = _clean_mapping(self.dtype) - return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, - self.verbose, clean_conv, - clean_dtypes) + # Apply NA values. 
+ clean_na_values = {} + clean_na_fvalues = {} + + if isinstance(self.na_values, dict): + for col in self.na_values: + na_value = self.na_values[col] + na_fvalue = self.na_fvalues[col] + + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + + clean_na_values[col] = na_value + clean_na_fvalues[col] = na_fvalue + else: + clean_na_values = self.na_values + clean_na_fvalues = self.na_fvalues + + return self._convert_to_ndarrays(data, clean_na_values, + clean_na_fvalues, self.verbose, + clean_conv, clean_dtypes) def _to_recarray(self, data, columns): dtypes = [] @@ -2767,6 +2785,7 @@ def _clean_na_values(na_values, keep_default_na=True): na_values = [] na_fvalues = set() elif isinstance(na_values, dict): + na_values = na_values.copy() # Prevent aliasing of the caller's dict. if keep_default_na: for k, v in compat.iteritems(na_values): if not is_list_like(v): diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index 92107cf2e82a7..e245bc5589145 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -266,3 +266,26 @@ def test_na_values_scalar(self): out = self.read_csv(StringIO(data), names=names, na_values={'a': 2, 'b': 1}) tm.assert_frame_equal(out, expected) + + def test_na_values_dict_aliasing(self): + na_values = {'a': 2, 'b': 1} + na_values_copy = na_values.copy() + + names = ['a', 'b'] + data = '1,2\n2,1' + + expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) + out = self.read_csv(StringIO(data), names=names, na_values=na_values) + + tm.assert_frame_equal(out, expected) + tm.assert_dict_equal(na_values, na_values_copy) + + def test_na_values_dict_col_index(self): + # see gh-14203: integer keys are column positions + + data = 'a\nfoo\n1' + na_values = {0: 'foo'} + + out = self.read_csv(StringIO(data), na_values=na_values) + expected = DataFrame({'a': [np.nan, 1]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d94a4ef278dee..8e52fc117b401 100644 --- 
a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1262,19 +1262,23 @@ cdef class TextReader: return None, set() if isinstance(self.na_values, dict): + key = None values = None + if name is not None and name in self.na_values: - values = self.na_values[name] - if values is not None and not isinstance(values, list): - values = list(values) - fvalues = self.na_fvalues[name] - if fvalues is not None and not isinstance(fvalues, set): - fvalues = set(fvalues) - else: - if i in self.na_values: - return self.na_values[i], self.na_fvalues[i] - else: - return _NA_VALUES, set() + key = name + elif i in self.na_values: + key = i + else: # No na_values provided for this column. + return _NA_VALUES, set() + + values = self.na_values[key] + if values is not None and not isinstance(values, list): + values = list(values) + + fvalues = self.na_fvalues[key] + if fvalues is not None and not isinstance(fvalues, set): + fvalues = set(fvalues) return _ensure_encoded(values), fvalues else: