From 3bab38b122b3f0c8fe9fb37a881e9b0b3e006f82 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Sun, 23 Jun 2024 14:41:21 +0800 Subject: [PATCH 01/17] BUG: Add type check for encoding_errors in pd.read_csv --- pandas/io/parsers/readers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8a07c99b0fe94..8c922f5f527af 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1413,6 +1413,13 @@ def _make_engine( raise ValueError( f"Unknown engine: {engine} (valid options are {mapping.keys()})" ) + + errors = self.options.get("encoding_errors", "strict") + if not isinstance(errors, str) and errors is not None: + raise ValueError( + f"encoding_errors must be a string, got {type(errors).__name__}" + ) + if not isinstance(f, list): # open file here is_text = True @@ -1437,7 +1444,7 @@ def _make_engine( compression=self.options.get("compression", None), memory_map=self.options.get("memory_map", False), is_text=is_text, - errors=self.options.get("encoding_errors", "strict"), + errors=errors, storage_options=self.options.get("storage_options", None), ) assert self.handles is not None From 31444ea96b423ccf744465007e3e444cb52ade6a Mon Sep 17 00:00:00 2001 From: Aliebc Date: Tue, 25 Jun 2024 10:25:27 +0800 Subject: [PATCH 02/17] BUG: Add type check for encoding_errors in pd.read_csv --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/readers.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3d869bf31f372..65e0016fc0226 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -554,6 +554,7 @@ I/O - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) +- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string or None. (:issue:`59059`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) Period diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8c922f5f527af..80fd53583ad14 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -674,6 +674,14 @@ def _read( # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) + + # Check type of encoding_errors + errors = kwds.get("encoding_errors", "strict") + if not isinstance(errors, str) and errors is not None: + raise ValueError( + f"encoding_errors must be a string or None, got {type(errors).__name__}" + ) + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( @@ -1414,12 +1422,6 @@ def _make_engine( f"Unknown engine: {engine} (valid options are {mapping.keys()})" ) - errors = self.options.get("encoding_errors", "strict") - if not isinstance(errors, str) and errors is not None: - raise ValueError( - f"encoding_errors must be a string, got {type(errors).__name__}" - ) - if not isinstance(f, list): # open file here is_text = True @@ -1444,7 +1446,7 @@ def _make_engine( compression=self.options.get("compression", None), memory_map=self.options.get("memory_map", False), is_text=is_text, - errors=errors, + errors=self.options.get("encoding_errors", "strict"), storage_options=self.options.get("storage_options", None), ) assert self.handles is not None From 54347277ab641ad19fd8731f9feccb9ea1a0ff67 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Tue, 25 Jun 2024 10:40:34 +0800 Subject: [PATCH 03/17] pre-commit --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/parsers/readers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5e4eeb1266492..1ff0d38894a0f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -554,9 +554,9 @@ I/O - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) +- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string or None. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) -- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string or None. (:issue:`59059`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) Period diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 2c56e9feddeb0..e02518c11e9c1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -674,14 +674,14 @@ def _read( # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) - + # Check type of encoding_errors errors = kwds.get("encoding_errors", "strict") if not isinstance(errors, str) and errors is not None: raise ValueError( f"encoding_errors must be a string or None, got {type(errors).__name__}" ) - + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( From f948b6267bcbc90499e64fe94c8a98f1fc30bfc1 Mon Sep 17 00:00:00 2001 From: AX Date: Wed, 26 Jun 2024 01:25:06 +0800 Subject: [PATCH 04/17] Update pandas/io/parsers/readers.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/parsers/readers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e02518c11e9c1..d41106e509329 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1421,7 +1421,6 @@ def _make_engine( raise ValueError( f"Unknown engine: {engine} (valid options are {mapping.keys()})" ) - if not isinstance(f, list): # open file here is_text = True From d226cc25e9abd81f6d8bf10d7878936101fa09e9 Mon Sep 17 00:00:00 2001 From: AX Date: Wed, 26 Jun 2024 01:30:04 +0800 Subject: [PATCH 05/17] Unit test Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d41106e509329..770b02b1a18c9 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -677,7 +677,7 @@ def _read( # Check type of encoding_errors errors = kwds.get("encoding_errors", "strict") - if not isinstance(errors, str) and errors is not None: + if not isinstance(errors, str): raise ValueError( f"encoding_errors must be a string or None, got {type(errors).__name__}" ) From d9f5583eac5797b220c9a0b44fff808c3c205642 Mon Sep 17 00:00:00 2001 From: "Y.X" Date: Wed, 26 Jun 2024 11:09:03 +0800 Subject: [PATCH 06/17] Update pandas/io/parsers/readers.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 770b02b1a18c9..c28d3aaaf4748 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -679,7 +679,7 @@ def _read( errors = kwds.get("encoding_errors", "strict") if not isinstance(errors, str): raise ValueError( - f"encoding_errors must be a string or None, got {type(errors).__name__}" + f"encoding_errors must be a string, got {type(errors).__name__}" ) if kwds.get("engine") == "pyarrow": From 1305cfd8f46a2129fc6f0526b704ff9f61ead83d Mon Sep 17 00:00:00 2001 From: Aliebc Date: Wed, 26 Jun 2024 11:49:13 +0800 Subject: [PATCH 07/17] update the unit test for `encoding_errors` --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d73790365bb1f..8d3fb5ceaf397 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -555,7 +555,7 @@ def test_explicit_encoding(io_class, mode, msg): expected.to_csv(buffer, mode=f"w{mode}") -@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"]) +@pytest.mark.parametrize("encoding_errors", ["strict", "replace"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_encoding_errors(encoding_errors, format): # GH39450 From efec1b5f08eb7e0b2d2b78b641525e7a2376a8a6 Mon Sep 17 00:00:00 2001 From: "Y.X" Date: Wed, 26 Jun 2024 11:57:17 +0800 Subject: [PATCH 08/17] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1ff0d38894a0f..37de212a63462 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -554,7 +554,7 @@ I/O - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) -- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string or None. (:issue:`59059`) +- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) From be5215a1ac247bd777348b77b26957b17328be2a Mon Sep 17 00:00:00 2001 From: Aliebc Date: Thu, 27 Jun 2024 13:03:35 +0800 Subject: [PATCH 09/17] add a unit test --- pandas/tests/io/test_common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 8d3fb5ceaf397..3ca835cbe9f82 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -589,6 +589,21 @@ def test_encoding_errors(encoding_errors, format): expected = pd.DataFrame({decoded: [decoded]}, index=[decoded * 2]) tm.assert_frame_equal(df, expected) +@pytest.mark.parametrize("encoding_errors", [0, None, "strict"]) +@pytest.mark.parametrize("format", ["csv"]) +def test_encoding_errors_badtype(encoding_errors, format): + # GH 59075 + with tm.ensure_clean() as path: + if format == "csv": + content = StringIO("A,B\n1,2\n3,4\n") + reader = partial(pd.read_csv, encoding_errors=encoding_errors) + if encoding_errors != "strict": + with pytest.raises(TypeError, match=f"encoding_errors must be a string, got {type(encoding_errors).__name__}"): + reader(content) + else: + df = reader(content) + expected = pd.DataFrame({"A": [1, 3], "B": [2, 4]}) + tm.assert_frame_equal(df, expected) def test_bad_encdoing_errors(): # GH 39777 From 641163172347e4d80549da24b0d2780d5c2dc808 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Thu, 27 Jun 2024 13:11:10 +0800 Subject: [PATCH 10/17] update unit test --- pandas/tests/io/test_common.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 3ca835cbe9f82..57f0f70ac606b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -593,17 +593,18 @@ def test_encoding_errors(encoding_errors, format): @pytest.mark.parametrize("format", ["csv"]) def test_encoding_errors_badtype(encoding_errors, format): # GH 59075 - with tm.ensure_clean() as path: - if format == "csv": - content = StringIO("A,B\n1,2\n3,4\n") - reader = partial(pd.read_csv, encoding_errors=encoding_errors) - if encoding_errors != "strict": - with pytest.raises(TypeError, match=f"encoding_errors must be a string, got {type(encoding_errors).__name__}"): - reader(content) - else: - df = reader(content) - expected = pd.DataFrame({"A": [1, 3], "B": [2, 4]}) - tm.assert_frame_equal(df, expected) + if format == "csv": + content = StringIO("A,B\n1,2\n3,4\n") + reader = partial(pd.read_csv, encoding_errors=encoding_errors) + if encoding_errors != "strict": + with pytest.raises(TypeError, + match=f"encoding_errors must be a string, got {type(encoding_errors).__name__}" + ): + reader(content) + else: + df = reader(content) + expected = pd.DataFrame({"A": [1, 3], "B": [2, 4]}) + tm.assert_frame_equal(df, expected) def test_bad_encdoing_errors(): # GH 39777 From 984e5b80863efe9bdce78f3d9f5f3edd1b4174c0 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Thu, 27 Jun 2024 13:14:11 +0800 Subject: [PATCH 11/17] update unit test --- pandas/tests/io/test_common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 57f0f70ac606b..aa8532861b840 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -597,9 +597,9 @@ def test_encoding_errors_badtype(encoding_errors, format): content = StringIO("A,B\n1,2\n3,4\n") reader = partial(pd.read_csv, encoding_errors=encoding_errors) if encoding_errors != "strict": - with pytest.raises(TypeError, - match=f"encoding_errors must be a string, got {type(encoding_errors).__name__}" - ): + expected_error = "encoding_errors must be a string, got " + expected_error += f"{type(encoding_errors).__name__}" + with pytest.raises(TypeError, match=expected_error): reader(content) else: df = reader(content) From e9f9fa549bd97c7281f8d70e9ebfa667165ee42e Mon Sep 17 00:00:00 2001 From: Aliebc Date: Thu, 27 Jun 2024 13:16:41 +0800 Subject: [PATCH 12/17] update unit test --- pandas/tests/io/test_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index aa8532861b840..9a1b4c0129357 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -589,6 +589,7 @@ def test_encoding_errors(encoding_errors, format): expected = pd.DataFrame({decoded: [decoded]}, index=[decoded * 2]) tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize("encoding_errors", [0, None, "strict"]) @pytest.mark.parametrize("format", ["csv"]) def test_encoding_errors_badtype(encoding_errors, format): @@ -606,6 +607,7 @@ def test_encoding_errors_badtype(encoding_errors, format): expected = pd.DataFrame({"A": [1, 3], "B": [2, 4]}) tm.assert_frame_equal(df, expected) + def test_bad_encdoing_errors(): # GH 39777 with tm.ensure_clean() as path: From 6f813b02b8e39be0d53292f9574b2d4636f27dc1 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Thu, 27 Jun 2024 14:55:08 +0800 Subject: [PATCH 13/17] update unit test --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 9a1b4c0129357..4ee1fcb3a9b02 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -600,7 +600,7 @@ def test_encoding_errors_badtype(encoding_errors, format): if encoding_errors != "strict": expected_error = "encoding_errors must be a string, got " expected_error += f"{type(encoding_errors).__name__}" - with pytest.raises(TypeError, match=expected_error): + with pytest.raises(ValueError, match=expected_error): reader(content) else: df = reader(content) From 5fea3826ad54d950cd8a35100b2d4323543778c1 Mon Sep 17 00:00:00 2001 From: "Y.X" Date: Thu, 27 Jun 2024 23:16:59 +0800 Subject: [PATCH 14/17] Update pandas/tests/io/test_common.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 4ee1fcb3a9b02..cedc01a152050 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -591,7 +591,6 @@ def test_encoding_errors(encoding_errors, format): @pytest.mark.parametrize("encoding_errors", [0, None, "strict"]) -@pytest.mark.parametrize("format", ["csv"]) def test_encoding_errors_badtype(encoding_errors, format): # GH 59075 if format == "csv": From 44baa46ae764550ad893e6bb85d0acf58fb311a3 Mon Sep 17 00:00:00 2001 From: "Y.X" Date: Thu, 27 Jun 2024 23:17:14 +0800 Subject: [PATCH 15/17] Update pandas/tests/io/test_common.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index cedc01a152050..71dbcb6073639 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -591,7 +591,7 @@ def test_encoding_errors(encoding_errors, format): @pytest.mark.parametrize("encoding_errors", [0, None, "strict"]) -def test_encoding_errors_badtype(encoding_errors, format): +def test_encoding_errors_badtype(encoding_errors): # GH 59075 if format == "csv": content = StringIO("A,B\n1,2\n3,4\n") From 5e08d4d83c8819b0f36a9a024dc3276f1a190abe Mon Sep 17 00:00:00 2001 From: Aliebc Date: Thu, 27 Jun 2024 23:18:19 +0800 Subject: [PATCH 16/17] update unit test --- pandas/tests/io/test_common.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 71dbcb6073639..a4221cb403ef2 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -590,21 +590,20 @@ def test_encoding_errors(encoding_errors, format): tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("encoding_errors", [0, None, "strict"]) +@pytest.mark.parametrize("encoding_errors", [0, None]) def test_encoding_errors_badtype(encoding_errors): # GH 59075 - if format == "csv": - content = StringIO("A,B\n1,2\n3,4\n") - reader = partial(pd.read_csv, encoding_errors=encoding_errors) - if encoding_errors != "strict": - expected_error = "encoding_errors must be a string, got " - expected_error += f"{type(encoding_errors).__name__}" - with pytest.raises(ValueError, match=expected_error): - reader(content) - else: - df = reader(content) - expected = pd.DataFrame({"A": [1, 3], "B": [2, 4]}) - tm.assert_frame_equal(df, expected) + content = StringIO("A,B\n1,2\n3,4\n") + reader = partial(pd.read_csv, encoding_errors=encoding_errors) + if encoding_errors != "strict": + expected_error = "encoding_errors must be a string, got " + expected_error += f"{type(encoding_errors).__name__}" + with pytest.raises(ValueError, match=expected_error): + reader(content) + else: + df = reader(content) + expected = pd.DataFrame({"A": [1, 3], "B": [2, 4]}) + tm.assert_frame_equal(df, expected) def test_bad_encdoing_errors(): From d422b5e93481d5176f65eb9ac5af782df973c2f0 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Thu, 27 Jun 2024 23:21:34 +0800 Subject: [PATCH 17/17] update unit test --- pandas/tests/io/test_common.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a4221cb403ef2..26bb2be73838a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -595,15 +595,10 @@ def test_encoding_errors_badtype(encoding_errors): # GH 59075 content = StringIO("A,B\n1,2\n3,4\n") reader = partial(pd.read_csv, encoding_errors=encoding_errors) - if encoding_errors != "strict": - expected_error = "encoding_errors must be a string, got " - expected_error += f"{type(encoding_errors).__name__}" - with pytest.raises(ValueError, match=expected_error): - reader(content) - else: - df = reader(content) - expected = pd.DataFrame({"A": [1, 3], "B": [2, 4]}) - tm.assert_frame_equal(df, expected) + expected_error = "encoding_errors must be a string, got " + expected_error += f"{type(encoding_errors).__name__}" + with pytest.raises(ValueError, match=expected_error): + reader(content) def test_bad_encdoing_errors():