From f9c120ab08808e4be3029228eb5e9dd6a10b411b Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 26 Nov 2021 23:05:07 +0100 Subject: [PATCH 1/2] BUG: read_csv not applying dtype to index col --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parsers/base_parser.py | 26 +++++++++++++++++++++++- pandas/io/parsers/python_parser.py | 19 ++--------------- pandas/tests/io/parser/test_index_col.py | 11 ++++++++++ 4 files changed, 39 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 39e3894f86302..4b67ab33bbbc6 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -658,6 +658,7 @@ I/O - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`) - Bug in :func:`json_normalize` where reading data with missing multi-level metadata would not respect errors="ignore" (:issue:`44312`) - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`) +- Bug in :func:`read_csv` not applying dtype for ``index_col`` (:issue:`9435`) - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`) - Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 25a89d1c57006..c56032b49d081 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections import defaultdict +from copy import copy import csv import datetime from enum import Enum @@ -148,6 +149,8 @@ def __init__(self, kwds): self.na_filter = kwds.get("na_filter", False) self.keep_default_na = kwds.get("keep_default_na", True) + self.dtype = copy(kwds.get("dtype", None)) + self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) @@ -498,6 +501,17 @@ def _get_name(icol): return index + def _clean_mapping(self, mapping): + """converts col numbers to names""" + if not isinstance(mapping, dict): + return mapping + clean = {} + for col, v in mapping.items(): + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + clean[col] = v + return clean + @final def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] @@ -522,7 +536,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: col_name, self.na_values, self.na_fvalues, self.keep_default_na ) - arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) + clean_dtypes = self._clean_mapping(self.dtype) + + cast_type = None + if isinstance(clean_dtypes, dict) and self.index_names is not None: + cast_type = clean_dtypes.get(self.index_names[i], None) + + try_num_bool = not (cast_type and is_string_dtype(cast_type)) + + arr, _ = self._infer_types( + arr, col_na_values | col_na_fvalues, try_num_bool + ) arrays.append(arr) names = self.index_names diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f5420618c0235..6800fca508adc 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -4,7 +4,6 @@ abc, defaultdict, ) -from copy import copy import csv from io import StringIO import re @@ -89,7 +88,6 @@ def __init__( self.verbose = kwds["verbose"] self.converters = kwds["converters"] - self.dtype = copy(kwds["dtype"]) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -308,21 +306,8 @@ def get_chunk(self, size=None): def _convert_data(self, data): # apply converters - def _clean_mapping(mapping): - """converts col numbers to names""" - clean = {} - for col, v in mapping.items(): - if isinstance(col, int) and col not in self.orig_names: - col = self.orig_names[col] - clean[col] = v - return clean - - clean_conv = _clean_mapping(self.converters) - if not isinstance(self.dtype, dict): - # handles single dtype applied to all columns - clean_dtypes = self.dtype - else: - clean_dtypes = _clean_mapping(self.dtype) + clean_conv = self._clean_mapping(self.converters) + clean_dtypes = self._clean_mapping(self.dtype) # Apply NA values. clean_na_values = {} diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 7315dcc0c4c07..58b5eebbec344 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -321,3 +321,14 @@ def test_infer_types_boolean_sum(all_parsers): # index column of dtype 'object', and the Python parser will return a # index column of dtype 'int64'. tm.assert_frame_equal(result, expected, check_index_type=False) + + +@skip_pyarrow +@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) +def test_specify_dtype_for_index_col(all_parsers, dtype, val): + # GH#9435 + data = "a,b\n01,2" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) + expected = DataFrame({"b": [2]}, index=Index([val], name="a")) + tm.assert_frame_equal(result, expected) From 0cf6eeeffae69106c65472d164d9d3e2477ba1de Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 27 Nov 2021 00:15:16 +0100 Subject: [PATCH 2/2] Fix typing --- pandas/io/parsers/base_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c56032b49d081..f06950e3450a7 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -507,6 +507,8 @@ def _clean_mapping(self, mapping): return mapping clean = {} for col, v in mapping.items(): + # for mypy + assert self.orig_names is not None if isinstance(col, int) and col not in self.orig_names: col = self.orig_names[col] clean[col] = v