From b07a43285607dc9c60ea1b1977956ef438f1653e Mon Sep 17 00:00:00 2001 From: Justin McOmie Date: Tue, 13 Jul 2021 09:55:18 -0700 Subject: [PATCH 1/2] BUG: Fix multi-index colname references in read_csv c engine. This fixes an issue with the read_csv c engine when the input has more than one header row and arguments to dtype, na_values, or converters reference multi-index column names as tuples. --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/_libs/parsers.pyx | 5 +++- .../io/parser/dtypes/test_dtypes_basic.py | 25 +++++++++++++++++++ pandas/tests/io/parser/test_converters.py | 25 +++++++++++++++++++ pandas/tests/io/parser/test_na_values.py | 19 ++++++++++++++ 5 files changed, 74 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index cb2a59860783f..4f57597432e2c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -230,7 +230,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) -- +- Bug in :func:`read_csv` with the C engine and multi-row header input where ``dtype``, ``na_values``, or ``converters`` reference column names as tuples (:issue:`42446`) - Period diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e5e61e409c320..d97d65d61e30d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1281,7 +1281,10 @@ cdef class TextReader: if j >= len(self.header[0]): return j else: - return self.header[0][j] + if self.has_mi_columns: + return tuple(header_row[j] for header_row in self.header) + else: + return self.header[0][j] else: return None diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index bc20f1d1eea5f..c1e6f7b343e8f 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -257,3 +257,28 @@ def test_dtype_mangle_dup_cols_single_dtype(all_parsers): result = parser.read_csv(StringIO(data), 
dtype=str) expected = DataFrame({"a": ["1"], "a.1": ["1"]}) tm.assert_frame_equal(result, expected) + + +def test_dtype_multi_index(all_parsers): + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,3" + + result = parser.read_csv( + StringIO(data), + header=list(range(2)), + dtype={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + + expected = DataFrame( + { + ("A", "X"): np.int32([1]), + ("B", "Y"): np.int32([2]), + ("B", "Z"): np.float32([3]), + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index ffa6c8259a59e..de00d2f25012e 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -161,3 +161,28 @@ def test_converter_index_col_bug(all_parsers): xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) tm.assert_frame_equal(rs, xp) + + +def test_converter_multi_index(all_parsers): + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,3" + + result = parser.read_csv( + StringIO(data), + header=list(range(2)), + converters={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + + expected = DataFrame( + { + ("A", "X"): np.int32([1]), + ("B", "Y"): np.int32([2]), + ("B", "Z"): np.float32([3]), + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 6e445f6813310..1d232eeca296c 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -570,3 +570,22 @@ def test_str_nan_dropped(all_parsers): ) tm.assert_frame_equal(result, expected) + + +def test_nan_multi_index(all_parsers): + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,inf" + + result = parser.read_csv( + StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} + ) + + expected = DataFrame( + { + ("A", "X"): [1], + ("B", "Y"): [2], + ("B", 
"Z"): [np.nan], + } + ) + + tm.assert_frame_equal(result, expected) From c38b75e07b5f12f0363b3f6de950578c358e7420 Mon Sep 17 00:00:00 2001 From: Justin McOmie Date: Tue, 13 Jul 2021 17:49:47 -0700 Subject: [PATCH 2/2] Updates for CR: comment, if block consolidation. --- pandas/_libs/parsers.pyx | 7 +++---- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 1 + pandas/tests/io/parser/test_converters.py | 1 + pandas/tests/io/parser/test_na_values.py | 1 + 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d97d65d61e30d..3655d6efad66e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1280,11 +1280,10 @@ cdef class TextReader: # generate extra (bogus) headers if there are more columns than headers if j >= len(self.header[0]): return j + elif self.has_mi_columns: + return tuple(header_row[j] for header_row in self.header) else: - if self.has_mi_columns: - return tuple(header_row[j] for header_row in self.header) - else: - return self.header[0][j] + return self.header[0][j] else: return None diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index c1e6f7b343e8f..6ed52ed86af2a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -260,6 +260,7 @@ def test_dtype_mangle_dup_cols_single_dtype(all_parsers): def test_dtype_multi_index(all_parsers): + # GH 42446 parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,3" diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index de00d2f25012e..78b64baab4dc0 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -164,6 +164,7 @@ def test_converter_index_col_bug(all_parsers): def test_converter_multi_index(all_parsers): + # GH 42446 parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,3" diff --git 
a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 1d232eeca296c..2880bf8690b46 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -573,6 +573,7 @@ def test_str_nan_dropped(all_parsers): def test_nan_multi_index(all_parsers): + # GH 42446 parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,inf"