From 1f7534ca335cbd21fca1e499d771a213a165cdd7 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 18 Feb 2022 16:42:05 +0100
Subject: [PATCH 1/4] ENH: Add defaultdict support for dtype in read_csv

---
 doc/source/user_guide/io.rst                  |  6 +++++
 doc/source/whatsnew/v1.5.0.rst                |  1 +
 pandas/_libs/parsers.pyx                      |  5 ++++
 pandas/io/parsers/base_parser.py              |  3 +++
 pandas/io/parsers/c_parser_wrapper.py         |  9 ++++++-
 pandas/io/parsers/readers.py                  |  6 +++++
 .../io/parser/dtypes/test_dtypes_basic.py     | 27 +++++++++++++++++++
 7 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index f11b6af24e4e4..e9773ace7c52e 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -186,6 +186,12 @@ dtype : Type name or dict of column -> type, default ``None``
   (unsupported with ``engine='python'``). Use ``str`` or ``object`` together
   with suitable ``na_values`` settings to preserve and
   not interpret dtype.
+
+  .. versionadded:: 1.5.0
+
+    Support for defaultdict was added. Specify a defaultdict as input where
+    the default determines the dtype of the columns which are not explicitly
+    listed.
 engine : {``'c'``, ``'python'``, ``'pyarrow'``}
   Parser engine to use. The C and pyarrow engines are faster, while the python engine
   is currently more feature-complete. Multithreading is currently only supported by
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index c8b2617ffc535..e72e8f5fed358 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -37,6 +37,7 @@ Other enhancements
 - :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
 - :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
 - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
+- :func:`read_csv` now supports passing a ``defaultdict`` as the ``dtype`` argument (:issue:`41574`)
 - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`)
 - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`)
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index b4d2c60837a7e..28ddc13b0d45b 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1,6 +1,7 @@
 # Copyright (c) 2012, Lambda Foundry, Inc.
 # See LICENSE for the license
 from base64 import decode
+from collections import defaultdict
 from csv import (
     QUOTE_MINIMAL,
     QUOTE_NONE,
@@ -964,6 +965,8 @@ cdef class TextReader:
 
         results = {}
         nused = 0
+        is_default_dict_dtype = isinstance(self.dtype, defaultdict)
+
         for i in range(self.table_width):
             if i < self.leading_cols:
                 # Pass through leading columns always
@@ -994,6 +997,8 @@ cdef class TextReader:
                         col_dtype = self.dtype[name]
                     elif i in self.dtype:
                         col_dtype = self.dtype[i]
+                    elif is_default_dict_dtype:
+                        col_dtype = self.dtype[name]
                 else:
                     if self.dtype.names:
                         # structured array
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 7927439abb510..a3a980c8f3b81 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -471,6 +471,9 @@ def _clean_mapping(self, mapping):
             if isinstance(col, int) and col not in self.orig_names:
                 col = self.orig_names[col]
             clean[col] = v
+        if isinstance(mapping, defaultdict):
+            remaining_cols = set(self.orig_names) - set(clean.keys())
+            clean.update({col: mapping[col] for col in remaining_cols})
         return clean
 
     @final
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index e8909f542f335..5801fab4a7e76 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections import defaultdict
 from typing import (
     Hashable,
     Mapping,
@@ -415,7 +416,13 @@ def ensure_dtype_objs(
     Ensure we have either None, a dtype object, or a dictionary mapping to
     dtype objects.
     """
-    if isinstance(dtype, dict):
+    if isinstance(dtype, defaultdict):
+        default_dtype = pandas_dtype(dtype.default_factory())
+        dtype_converted = defaultdict(lambda: default_dtype)
+        for key in dtype.keys():
+            dtype_converted[key] = pandas_dtype(dtype[key])
+        return dtype_converted
+    elif isinstance(dtype, dict):
         return {k: pandas_dtype(dtype[k]) for k in dtype}
     elif dtype is not None:
         return pandas_dtype(dtype)
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index c5b84dd18ec13..dc73bef4853ba 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -165,6 +165,12 @@
     to preserve and not interpret dtype.
     If converters are specified, they will be applied INSTEAD
     of dtype conversion.
+
+    .. versionadded:: 1.5.0
+
+        Support for defaultdict was added. Specify a defaultdict as input where
+        the default determines the dtype of the columns which are not explicitly
+        listed.
 engine : {{'c', 'python', 'pyarrow'}}, optional
     Parser engine to use. The C and pyarrow engines are faster, while the python engine
     is currently more feature-complete. Multithreading is currently only supported by
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 4d99b3c3c8c85..c350935bd4749 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -2,6 +2,7 @@
 Tests dtype specification during parsing
 for all of the parsers defined in parsers.py
 """
+from collections import defaultdict
 from io import StringIO
 
 import numpy as np
@@ -335,3 +336,29 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
     )
     actual = parser.read_csv(StringIO(data), dtype=dtype)
     tm.assert_frame_equal(actual, expected)
+
+
+@pytest.mark.parametrize("default", ["float", "float64"])
+def test_dtypes_defaultdict(all_parsers, default):
+    # GH#41574
+    data = """a,b
+1,2
+"""
+    dtype = defaultdict(lambda: "float64", a="int64")
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), dtype=dtype)
+    expected = DataFrame({"a": [1], "b": 2.0})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
+    # GH#41574
+    data = """a,b,a,b,b.1
+1,2,3,4,5
+"""
+    dtype = defaultdict(lambda: "float64", a="int64")
+    dtype["b.1"] = "int64"
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), dtype=dtype)
+    expected = DataFrame({"a": [1], "b": 2.0, "a.1": [3], "b.2": [4.0], "b.1": [5]})
+    tm.assert_frame_equal(result, expected)

From ac687e74af9d32bf70b927889a46ce88dd87a4cb Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Tue, 22 Feb 2022 23:55:39 +0100
Subject: [PATCH 2/4] TST: use parametrized default and test invalid defaultdict dtype

---
 pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index c350935bd4749..ca0e92947f160 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -344,7 +344,7 @@ def test_dtypes_defaultdict(all_parsers, default):
     data = """a,b
 1,2
 """
-    dtype = defaultdict(lambda: "float64", a="int64")
+    dtype = defaultdict(lambda: default, a="int64")
     parser = all_parsers
     result = parser.read_csv(StringIO(data), dtype=dtype)
     expected = DataFrame({"a": [1], "b": 2.0})
@@ -362,3 +362,14 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     result = parser.read_csv(StringIO(data), dtype=dtype)
     expected = DataFrame({"a": [1], "b": 2.0, "a.1": [3], "b.2": [4.0], "b.1": [5]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_dtypes_defaultdict_invalid(all_parsers):
+    # GH#41574
+    data = """a,b
+1,2
+"""
+    dtype = defaultdict(lambda: "invalid_dtype", a="int64")
+    parser = all_parsers
+    with pytest.raises(TypeError, match="not understood"):
+        parser.read_csv(StringIO(data), dtype=dtype)

From 759bed4620a8a4992c836c762765aafe31e2d2e3 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Wed, 9 Mar 2022 22:17:07 +0100
Subject: [PATCH 3/4] TST: wrap scalar in a list in expected frame of mangle_dup_cols test

---
 pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index ca0e92947f160..0f5ae06106f9b 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -360,7 +360,7 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     dtype["b.1"] = "int64"
     parser = all_parsers
     result = parser.read_csv(StringIO(data), dtype=dtype)
-    expected = DataFrame({"a": [1], "b": 2.0, "a.1": [3], "b.2": [4.0], "b.1": [5]})
+    expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]})
     tm.assert_frame_equal(result, expected)
 
 

From cdf7f2f613a1a8492d3b25f121f61529899cb7d2 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Wed, 9 Mar 2022 22:22:40 +0100
Subject: [PATCH 4/4] TYP: fix mypy errors in _clean_mapping and ensure_dtype_objs

---
 pandas/io/parsers/base_parser.py      | 5 +++--
 pandas/io/parsers/c_parser_wrapper.py | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index ca46af29543c1..a3344220122bb 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -466,9 +466,10 @@ def _clean_mapping(self, mapping):
         if not isinstance(mapping, dict):
             return mapping
         clean = {}
+        # for mypy: orig_names must be non-None before it is indexed below
+        assert self.orig_names is not None
+
         for col, v in mapping.items():
-            # for mypy
-            assert self.orig_names is not None
             if isinstance(col, int) and col not in self.orig_names:
                 col = self.orig_names[col]
             clean[col] = v
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 5801fab4a7e76..5facae764bde5 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -417,8 +417,9 @@ def ensure_dtype_objs(
     dtype objects.
     """
     if isinstance(dtype, defaultdict):
-        default_dtype = pandas_dtype(dtype.default_factory())
-        dtype_converted = defaultdict(lambda: default_dtype)
+        # "None" not callable  [misc]
+        default_dtype = pandas_dtype(dtype.default_factory())  # type: ignore[misc]
+        dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
         for key in dtype.keys():
             dtype_converted[key] = pandas_dtype(dtype[key])
         return dtype_converted