From 1f7534ca335cbd21fca1e499d771a213a165cdd7 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 18 Feb 2022 16:42:05 +0100 Subject: [PATCH 1/4] ENH: Add defaultdict support for dtype in read_csv --- doc/source/user_guide/io.rst | 5 ++++ doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/parsers.pyx | 5 ++++ pandas/io/parsers/base_parser.py | 3 +++ pandas/io/parsers/c_parser_wrapper.py | 9 ++++++- pandas/io/parsers/readers.py | 6 +++++ .../io/parser/dtypes/test_dtypes_basic.py | 27 +++++++++++++++++++ 7 files changed, 55 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f11b6af24e4e4..e9773ace7c52e 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -186,6 +186,11 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve and not interpret dtype. + .. versionadded:: 1.5.0 + + Support for defaultdict was added. Specify a defaultdict as input where + the default determines the dtype of the columns which are not explicitly + listed. engine : {``'c'``, ``'python'``, ``'pyarrow'``} Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. Multithreading is currently only supported by diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c8b2617ffc535..e72e8f5fed358 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -37,6 +37,7 @@ Other enhancements - :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`) - :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`) - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) +- :func:`read_csv` now supports ``defaultdict`` as a ``dtype`` parameter (:issue:`41574`) - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b4d2c60837a7e..28ddc13b0d45b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license from base64 import decode +from collections import defaultdict from csv import ( QUOTE_MINIMAL, QUOTE_NONE, @@ -964,6 +965,8 @@ cdef class TextReader: results = {} nused = 0 + is_default_dict_dtype = isinstance(self.dtype, defaultdict) + for i in range(self.table_width): if i < self.leading_cols: # Pass through leading columns always @@ -994,6 +997,8 @@ cdef class TextReader: col_dtype = self.dtype[name] elif i in self.dtype: col_dtype = self.dtype[i] + elif is_default_dict_dtype: + col_dtype = self.dtype[name] else: if self.dtype.names: # structured array diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 7927439abb510..a3a980c8f3b81 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -471,6 +471,9 @@ def _clean_mapping(self, mapping): if isinstance(col, int) and col not in self.orig_names: col = self.orig_names[col] clean[col] = v + if isinstance(mapping, defaultdict): + remaining_cols = set(self.orig_names) - set(clean.keys()) + clean.update({col: mapping[col] for col in remaining_cols}) return clean @final diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index e8909f542f335..5801fab4a7e76 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import defaultdict from typing import ( Hashable, Mapping, @@ -415,7 +416,13 @@ def ensure_dtype_objs( Ensure we have either None, a dtype object, or a dictionary mapping to dtype objects. """ - if isinstance(dtype, dict): + if isinstance(dtype, defaultdict): + default_dtype = pandas_dtype(dtype.default_factory()) + dtype_converted = defaultdict(lambda: default_dtype) + for key in dtype.keys(): + dtype_converted[key] = pandas_dtype(dtype[key]) + return dtype_converted + elif isinstance(dtype, dict): return {k: pandas_dtype(dtype[k]) for k in dtype} elif dtype is not None: return pandas_dtype(dtype) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c5b84dd18ec13..dc73bef4853ba 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -165,6 +165,12 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. + + .. versionadded:: 1.5.0 + + Support for defaultdict was added. Specify a defaultdict as input where + the default determines the dtype of the columns which are not explicitly + listed. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. Multithreading is currently only supported by diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 4d99b3c3c8c85..c350935bd4749 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -2,6 +2,7 @@ Tests dtype specification during parsing for all of the parsers defined in parsers.py """ +from collections import defaultdict from io import StringIO import numpy as np @@ -335,3 +336,29 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype): ) actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("default", ["float", "float64"]) +def test_dtypes_defaultdict(all_parsers, default): + # GH#41574 + data = """a,b +1,2 +""" + dtype = defaultdict(lambda: "float64", a="int64") + parser = all_parsers + result = parser.read_csv(StringIO(data), dtype=dtype) + expected = DataFrame({"a": [1], "b": 2.0}) + tm.assert_frame_equal(result, expected) + + +def test_dtypes_defaultdict_mangle_dup_cols(all_parsers): + # GH#41574 + data = """a,b,a,b,b.1 +1,2,3,4,5 +""" + dtype = defaultdict(lambda: "float64", a="int64") + dtype["b.1"] = "int64" + parser = all_parsers + result = parser.read_csv(StringIO(data), dtype=dtype) + expected = DataFrame({"a": [1], "b": 2.0, "a.1": [3], "b.2": [4.0], "b.1": [5]}) + tm.assert_frame_equal(result, expected) From ac687e74af9d32bf70b927889a46ce88dd87a4cb Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 22 Feb 2022 23:55:39 +0100 Subject: [PATCH 2/4] Add test --- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index c350935bd4749..ca0e92947f160 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -344,7 +344,7 @@ def test_dtypes_defaultdict(all_parsers, default): data = """a,b 1,2 """ - dtype = defaultdict(lambda: "float64", a="int64") + dtype = defaultdict(lambda: default, a="int64") parser = all_parsers result = parser.read_csv(StringIO(data), dtype=dtype) expected = DataFrame({"a": [1], "b": 2.0}) @@ -362,3 +362,14 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers): result = parser.read_csv(StringIO(data), dtype=dtype) expected = DataFrame({"a": [1], "b": 2.0, "a.1": [3], "b.2": [4.0], "b.1": [5]}) tm.assert_frame_equal(result, expected) + + +def test_dtypes_defaultdict_invalid(all_parsers): + # GH#41574 + data = """a,b +1,2 +""" + dtype = defaultdict(lambda: "invalid_dtype", a="int64") + parser = all_parsers + with pytest.raises(TypeError, match="not understood"): + parser.read_csv(StringIO(data), dtype=dtype) From 759bed4620a8a4992c836c762765aafe31e2d2e3 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 9 Mar 2022 22:17:07 +0100 Subject: [PATCH 3/4] Add brackets --- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ca0e92947f160..0f5ae06106f9b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -360,7 +360,7 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers): dtype["b.1"] = "int64" parser = all_parsers result = parser.read_csv(StringIO(data), dtype=dtype) - expected = DataFrame({"a": [1], "b": 2.0, "a.1": [3], "b.2": [4.0], "b.1": [5]}) + expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]}) tm.assert_frame_equal(result, expected) From cdf7f2f613a1a8492d3b25f121f61529899cb7d2 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 9 Mar 2022 22:22:40 +0100 Subject: [PATCH 4/4] Fix mypy --- pandas/io/parsers/base_parser.py | 5 +++-- pandas/io/parsers/c_parser_wrapper.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index ca46af29543c1..a3344220122bb 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -466,9 +466,10 @@ def _clean_mapping(self, mapping): if not isinstance(mapping, dict): return mapping clean = {} + # for mypy + assert self.orig_names is not None + for col, v in mapping.items(): - # for mypy - assert self.orig_names is not None if isinstance(col, int) and col not in self.orig_names: col = self.orig_names[col] clean[col] = v diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 5801fab4a7e76..5facae764bde5 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -417,8 +417,9 @@ def ensure_dtype_objs( dtype objects. """ if isinstance(dtype, defaultdict): - default_dtype = pandas_dtype(dtype.default_factory()) - dtype_converted = defaultdict(lambda: default_dtype) + # "None" not callable [misc] + default_dtype = pandas_dtype(dtype.default_factory()) # type: ignore[misc] + dtype_converted: defaultdict = defaultdict(lambda: default_dtype) for key in dtype.keys(): dtype_converted[key] = pandas_dtype(dtype[key]) return dtype_converted