diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a9219cd811b6f..3a8583d395cc4 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -186,6 +186,11 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve and not interpret dtype. + .. versionadded:: 1.5.0 + + Support for defaultdict was added. Specify a defaultdict as input where + the default determines the dtype of the columns which are not explicitly + listed. engine : {``'c'``, ``'python'``, ``'pyarrow'``} Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. Multithreading is currently only supported by diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 14e9fda1b910c..1148e37fa9318 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -39,6 +39,7 @@ Other enhancements - :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`) - :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`) - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) +- :func:`read_csv` now supports ``defaultdict`` as a ``dtype`` parameter (:issue:`41574`) - :meth:`DataFrame.rolling` and :meth:`Series.rolling` now support a ``step`` parameter with fixed-length windows (:issue:`15354`) - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6d96b5c0c9f03..926b7ffc0ac07 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license from base64 import decode +from collections import defaultdict from csv import ( QUOTE_MINIMAL, QUOTE_NONE, @@ -964,6 +965,8 @@ cdef class TextReader: results = {} nused = 0 + is_default_dict_dtype = isinstance(self.dtype, defaultdict) + for i in range(self.table_width): if i < self.leading_cols: # Pass through leading columns always @@ -994,6 +997,8 @@ cdef class TextReader: col_dtype = self.dtype[name] elif i in self.dtype: col_dtype = self.dtype[i] + elif is_default_dict_dtype: + col_dtype = self.dtype[name] else: if self.dtype.names: # structured array diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 705ee9c0d2716..2851ea36c8a33 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -466,12 +466,16 @@ def _clean_mapping(self, mapping): if not isinstance(mapping, dict): return mapping clean = {} + # for mypy + assert self.orig_names is not None + for col, v in mapping.items(): - # for mypy - assert self.orig_names is not None if isinstance(col, int) and col not in self.orig_names: col = self.orig_names[col] clean[col] = v + if isinstance(mapping, defaultdict): + remaining_cols = set(self.orig_names) - set(clean.keys()) + clean.update({col: mapping[col] for col in remaining_cols}) return clean @final diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 1d8cabb27c100..d18abc477665a 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import defaultdict from typing import ( Hashable, Mapping, @@ -415,7 +416,14 @@ def ensure_dtype_objs( Ensure we have either None, a dtype object, or a dictionary mapping to dtype objects. """ - if isinstance(dtype, dict): + if isinstance(dtype, defaultdict): + # "None" not callable [misc] + default_dtype = pandas_dtype(dtype.default_factory()) # type: ignore[misc] + dtype_converted: defaultdict = defaultdict(lambda: default_dtype) + for key in dtype.keys(): + dtype_converted[key] = pandas_dtype(dtype[key]) + return dtype_converted + elif isinstance(dtype, dict): return {k: pandas_dtype(dtype[k]) for k in dtype} elif dtype is not None: return pandas_dtype(dtype) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ae748b278dad4..c639f408c75f5 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -168,6 +168,12 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. + + .. versionadded:: 1.5.0 + + Support for defaultdict was added. Specify a defaultdict as input where + the default determines the dtype of the columns which are not explicitly + listed. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. Multithreading is currently only supported by diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index eded3c1126bad..da53664a0278d 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -2,6 +2,7 @@ Tests dtype specification during parsing for all of the parsers defined in parsers.py """ +from collections import defaultdict from io import StringIO import numpy as np @@ -343,3 +344,40 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype): ) actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("default", ["float", "float64"]) +def test_dtypes_defaultdict(all_parsers, default): + # GH#41574 + data = """a,b +1,2 +""" + dtype = defaultdict(lambda: default, a="int64") + parser = all_parsers + result = parser.read_csv(StringIO(data), dtype=dtype) + expected = DataFrame({"a": [1], "b": 2.0}) + tm.assert_frame_equal(result, expected) + + +def test_dtypes_defaultdict_mangle_dup_cols(all_parsers): + # GH#41574 + data = """a,b,a,b,b.1 +1,2,3,4,5 +""" + dtype = defaultdict(lambda: "float64", a="int64") + dtype["b.1"] = "int64" + parser = all_parsers + result = parser.read_csv(StringIO(data), dtype=dtype) + expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]}) + tm.assert_frame_equal(result, expected) + + +def test_dtypes_defaultdict_invalid(all_parsers): + # GH#41574 + data = """a,b +1,2 +""" + dtype = defaultdict(lambda: "invalid_dtype", a="int64") + parser = all_parsers + with pytest.raises(TypeError, match="not understood"): + parser.read_csv(StringIO(data), dtype=dtype)