Skip to content

ENH: Add defaultdict support for dtype in read_csv #46051

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ dtype : Type name or dict of column -> type, default ``None``
(unsupported with ``engine='python'``). Use ``str`` or ``object`` together
with suitable ``na_values`` settings to preserve and
not interpret dtype.
.. versionadded:: 1.5.0

Support for defaultdict was added. Specify a defaultdict as input where
the default determines the dtype of the columns which are not explicitly
listed.
engine : {``'c'``, ``'python'``, ``'pyarrow'``}
Parser engine to use. The C and pyarrow engines are faster, while the python engine
is currently more feature-complete. Multithreading is currently only supported by
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Other enhancements
- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
- :func:`read_csv` now supports ``defaultdict`` as a ``dtype`` parameter (:issue:`41574`)
- :meth:`DataFrame.rolling` and :meth:`Series.rolling` now support a ``step`` parameter with fixed-length windows (:issue:`15354`)
- Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`)
- Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`)
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2012, Lambda Foundry, Inc.
# See LICENSE for the license
from base64 import decode
from collections import defaultdict
from csv import (
QUOTE_MINIMAL,
QUOTE_NONE,
Expand Down Expand Up @@ -964,6 +965,8 @@ cdef class TextReader:

results = {}
nused = 0
is_default_dict_dtype = isinstance(self.dtype, defaultdict)

for i in range(self.table_width):
if i < self.leading_cols:
# Pass through leading columns always
Expand Down Expand Up @@ -994,6 +997,8 @@ cdef class TextReader:
col_dtype = self.dtype[name]
elif i in self.dtype:
col_dtype = self.dtype[i]
elif is_default_dict_dtype:
col_dtype = self.dtype[name]
else:
if self.dtype.names:
# structured array
Expand Down
8 changes: 6 additions & 2 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,12 +466,16 @@ def _clean_mapping(self, mapping):
if not isinstance(mapping, dict):
return mapping
clean = {}
# for mypy
assert self.orig_names is not None

for col, v in mapping.items():
# for mypy
assert self.orig_names is not None
if isinstance(col, int) and col not in self.orig_names:
col = self.orig_names[col]
clean[col] = v
if isinstance(mapping, defaultdict):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also support defaultdicts directly instead of converting here, but this simplifies adding support for converters too, and we also do not have to keep defaultdicts in mind every time we access the dtypes.

remaining_cols = set(self.orig_names) - set(clean.keys())
clean.update({col: mapping[col] for col in remaining_cols})
return clean

@final
Expand Down
10 changes: 9 additions & 1 deletion pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from collections import defaultdict
from typing import (
Hashable,
Mapping,
Expand Down Expand Up @@ -415,7 +416,14 @@ def ensure_dtype_objs(
Ensure we have either None, a dtype object, or a dictionary mapping to
dtype objects.
"""
if isinstance(dtype, dict):
if isinstance(dtype, defaultdict):
# "None" not callable [misc]
default_dtype = pandas_dtype(dtype.default_factory()) # type: ignore[misc]
dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
for key in dtype.keys():
dtype_converted[key] = pandas_dtype(dtype[key])
return dtype_converted
elif isinstance(dtype, dict):
return {k: pandas_dtype(dtype[k]) for k in dtype}
elif dtype is not None:
return pandas_dtype(dtype)
Expand Down
6 changes: 6 additions & 0 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,12 @@
to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.

.. versionadded:: 1.5.0

Support for defaultdict was added. Specify a defaultdict as input where
the default determines the dtype of the columns which are not explicitly
listed.
engine : {{'c', 'python', 'pyarrow'}}, optional
Parser engine to use. The C and pyarrow engines are faster, while the python engine
is currently more feature-complete. Multithreading is currently only supported by
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from collections import defaultdict
from io import StringIO

import numpy as np
Expand Down Expand Up @@ -343,3 +344,40 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("default", ["float", "float64"])
def test_dtypes_defaultdict(all_parsers, default):
    """Check that a ``defaultdict`` dtype supplies a fallback for unlisted columns.

    Column "a" is listed explicitly as ``int64``; column "b" is not listed, so
    its dtype comes from the defaultdict's factory (``default``).
    """
    # GH#41574
    csv_text = """a,b
1,2
"""
    dtype_map = defaultdict(lambda: default, a="int64")
    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_map)
    # "b" is expected to be parsed as a float even though the raw value is "2".
    expected = DataFrame({"a": [1], "b": 2.0})
    tm.assert_frame_equal(result, expected)


def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
    """Check ``defaultdict`` dtypes on a header that contains duplicate names.

    The header repeats "a" and "b" and also contains a literal "b.1" column;
    the expected frame below spells out the resulting column names and the
    dtype each one should end up with.
    """
    # GH#41574
    csv_text = """a,b,a,b,b.1
1,2,3,4,5
"""
    # Explicit entries for "a" and "b.1"; everything else uses the factory.
    dtype_map = defaultdict(lambda: "float64", {"a": "int64", "b.1": "int64"})
    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_map)
    expected = DataFrame(
        {"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]}
    )
    tm.assert_frame_equal(result, expected)


def test_dtypes_defaultdict_invalid(all_parsers):
    """Check that an unrecognised default dtype in a ``defaultdict`` raises.

    Column "b" is not listed, so it receives the factory value
    "invalid_dtype"; reading is expected to fail with a ``TypeError`` whose
    message contains "not understood".
    """
    # GH#41574
    csv_text = """a,b
1,2
"""
    bad_dtype_map = defaultdict(lambda: "invalid_dtype", a="int64")
    with pytest.raises(TypeError, match="not understood"):
        all_parsers.read_csv(StringIO(csv_text), dtype=bad_dtype_map)