From 2f40db3b841f998738ed1d4d4740461cf6474254 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 20 Dec 2022 18:11:09 +0100 Subject: [PATCH 01/18] BUG: str dtype ignored for column with dot I --- pandas/io/parsers/python_parser.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index aebf285e669bb..e2d846cc3109c 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -875,6 +875,12 @@ def _search_replace_num_columns( or search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) or not self.num.search(x.strip()) + or ( + self.columns + and self.dtype + and self.columns[i] in self.dtype + and self.dtype[self.columns[i]] is str + ) ): rl.append(x) else: From b032e1854f41d407f3dd610ecfb32c93baca2779 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 22 Dec 2022 11:05:08 +0100 Subject: [PATCH 02/18] BUG: add test to str dtype ignored for column with dot I --- .../io/parser/common/test_common_basic.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 6656face3be84..75e7d58b3a5ea 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -127,6 +127,25 @@ def test_1000_sep(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow +def test__search_replace_num_columns(all_parsers): + parser = all_parsers + data = """A;B;C +155.75;3.001;43.06.4 +16.2;0;23.0 +""" + expected = DataFrame( + { + "A": ["155.75", "16.2"], + "B": [3001, 0], + "C": [43064, 230], + } + ) + + result = parser.read_csv(StringIO(data), sep=";", dtype={"A": str}, thousands=".") + tm.assert_frame_equal(result, expected) + + @xfail_pyarrow def test_unnamed_columns(all_parsers): data = """A,B,C,, From ea1865d346f02c0aa54c366a6c4bfa79051fff08 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 22 Dec 2022 11:35:08 +0100 Subject: [PATCH 03/18] BUG: str dtype ignored for column with dot III --- pandas/io/parsers/python_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e2d846cc3109c..3b1f3e01ec687 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -34,6 +34,7 @@ ParserError, ) +from pandas.api.types import is_numeric_dtype from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.inference import is_dict_like @@ -879,7 +880,7 @@ def _search_replace_num_columns( self.columns and self.dtype and self.columns[i] in self.dtype - and self.dtype[self.columns[i]] is str + and not is_numeric_dtype(self.dtype.get(self.columns[i])) ) ): rl.append(x) From 1554f753aba8fb1aecd4fb5cf1cef82ca8c7956f Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 22 Dec 2022 11:48:39 +0100 Subject: [PATCH 04/18] BUG: str dtype ignored for column with dot IV --- pandas/io/parsers/python_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 3b1f3e01ec687..c449f827ea6e4 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -34,10 +34,11 @@ ParserError, ) -from pandas.api.types import is_numeric_dtype from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.inference import is_dict_like +from pandas.api.types import is_numeric_dtype + from pandas.io.parsers.base_parser import ( ParserBase, parser_defaults, From 1e09dfbe285f267a08ca099c80c40a304a074adb Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Mon, 2 Jan 2023 10:05:04 +0100 Subject: [PATCH 05/18] BUG: str dtype ignored for column with dot V --- pandas/io/parsers/python_parser.py | 45 ++++++++++++------- .../io/parser/test_python_parser_only.py | 45 +++++++++++++++++++ 2 files changed, 75 insertions(+), 15 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index c449f827ea6e4..1ddb7b4f9f82a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -34,11 +34,12 @@ ParserError, ) -from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.common import ( + is_integer, + is_numeric_dtype, +) from pandas.core.dtypes.inference import is_dict_like -from pandas.api.types import is_numeric_dtype - from pandas.io.parsers.base_parser import ( ParserBase, parser_defaults, @@ -153,12 +154,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self._col_indices = list(range(len(self.columns))) self._parse_date_cols = self._validate_parse_dates_presence(self.columns) - no_thousands_columns: set[int] | None = None - if self.parse_dates: - no_thousands_columns = self._set_noconvert_dtype_columns( - self._col_indices, self.columns - ) - self._no_thousands_columns = no_thousands_columns + self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: raise ValueError("Only length-1 decimal markers supported") @@ -877,12 +873,6 @@ def _search_replace_num_columns( or search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) or not self.num.search(x.strip()) - or ( - self.columns - and self.dtype - and self.columns[i] in self.dtype - and not is_numeric_dtype(self.dtype.get(self.columns[i])) - ) ): rl.append(x) else: @@ -1156,6 +1146,31 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar ] return new_rows + def _set_no_thousand_columns(self) -> set[int]: + noconvert_columns = set() + if self.parse_dates: + noconvert_columns = self._set_noconvert_dtype_columns( + self._col_indices, self.names + ) + + if self.columns and self.dtype: + if isinstance(self.dtype, dict): + for i in self._col_indices: + if ( + not is_numeric_dtype(self.dtype.get(self.columns[i], None)) + and self.columns[i] in self.dtype + ): + noconvert_columns.add(i) + else: + for i in self._col_indices: + if ( + not is_numeric_dtype(self.dtype) + and self.columns[i] in self.dtype + ): + noconvert_columns.add(i) + + return noconvert_columns + class FixedWidthReader(abc.Iterator): """ diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 5f067b205a72d..dea60981e13e9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -489,3 +489,48 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse ) expected = DataFrame({"a": ["a", "c", "f"]}) tm.assert_frame_equal(result, expected) + + +def test_no_thousand_convert_for_non_numeric_cols(python_parser_only): + # GH#50270 + parser = python_parser_only + data = """a;b;c +0000,7995;16,000.1;0 +3,03,001,00514;0;4,001 +4923,600,041;23,000;131 +""" + + result = parser.read_csv( + StringIO(data), sep=";", dtype={"a": str, "b": float, "c": int}, thousands="," + ) + expected = DataFrame( + { + "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], + "b": [16000.1, 0, 23000], + "c": [0, 4001, 131], + } + ) + tm.assert_frame_equal(result, expected) + + result2 = parser.read_csv( + StringIO(data), sep=";", dtype={"a": str, "b": str, "c": str}, thousands="," + ) + expected2 = DataFrame( + { + "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], + "b": ["16,000.1", "0", "23,000"], + "c": ["0", "4,001", "131"], + } + ) + tm.assert_frame_equal(result2, expected2) + + result3 = parser.read_csv(StringIO(data), sep=";", dtype=float, thousands=",") + expected3 = DataFrame( + { + "a": [7995, 30300100514, 4923600041], + "b": [16000.1, 0, 23000], + "c": [0, 4001, 131], + }, + dtype=float, + ) + tm.assert_frame_equal(result3, expected3) From 615a7224d67a51105cc2f562bdad3566562cdaf2 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 5 Jan 2023 15:26:53 +0100 Subject: [PATCH 06/18] BUG: str dtype ignored for column with dot VI --- pandas/io/parsers/python_parser.py | 33 +++++++------------ .../io/parser/common/test_common_basic.py | 4 ++- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 1ddb7b4f9f82a..c39b6acff0888 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1147,29 +1147,20 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar return new_rows def _set_no_thousand_columns(self) -> set[int]: - noconvert_columns = set() - if self.parse_dates: - noconvert_columns = self._set_noconvert_dtype_columns( - self._col_indices, self.names + no_thousands_columns: set[int] | None = None + if self.columns and self.parse_dates: + no_thousands_columns = self._set_noconvert_dtype_columns( + self._col_indices, self.columns ) - if self.columns and self.dtype: - if isinstance(self.dtype, dict): - for i in self._col_indices: - if ( - not is_numeric_dtype(self.dtype.get(self.columns[i], None)) - and self.columns[i] in self.dtype - ): - noconvert_columns.add(i) - else: - for i in self._col_indices: - if ( - not is_numeric_dtype(self.dtype) - and self.columns[i] in self.dtype - ): - noconvert_columns.add(i) - - return noconvert_columns + if no_thousands_columns is None: + no_thousands_columns = set() + for i in self._col_indices: + if isinstance(self.dtype, dict) and not is_numeric_dtype( + self.dtype.get(self.columns[i], None) + ): + no_thousands_columns.add(i) + return no_thousands_columns class FixedWidthReader(abc.Iterator): diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 75e7d58b3a5ea..cdbfbc3895dcf 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -142,7 +142,9 @@ def test__search_replace_num_columns(all_parsers): } ) - result = parser.read_csv(StringIO(data), sep=";", dtype={"A": str}, thousands=".") + result = parser.read_csv( + StringIO(data), sep=";", dtype={"A": str, "B": int, "C": int}, thousands="." + ) tm.assert_frame_equal(result, expected) From 4ac9b77f59261b5f8a7aebca798717b13ae988bd Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sun, 29 Jan 2023 17:23:05 +0100 Subject: [PATCH 07/18] TEST: added assert for mypy --- pandas/io/parsers/python_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index b31070de49a00..790986e265fc2 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1168,12 +1168,14 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar def _set_no_thousand_columns(self) -> set[int]: no_thousands_columns: set[int] | None = None if self.columns and self.parse_dates: + assert self._col_indices is not None no_thousands_columns = self._set_noconvert_dtype_columns( self._col_indices, self.columns ) if self.columns and self.dtype: if no_thousands_columns is None: no_thousands_columns = set() + assert self._col_indices is not None for i in self._col_indices: if isinstance(self.dtype, dict) and not is_numeric_dtype( self.dtype.get(self.columns[i], None) From 3bf5e1b09a29e2d275ed00e7c5923a96592109b2 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sun, 29 Jan 2023 19:26:10 +0100 Subject: [PATCH 08/18] TEST: added assert for mypy II --- pandas/io/parsers/python_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 790986e265fc2..610507c8b6672 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1166,15 +1166,13 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar return new_rows def _set_no_thousand_columns(self) -> set[int]: - no_thousands_columns: set[int] | None = None + no_thousands_columns: set[int] = set() if self.columns and self.parse_dates: assert self._col_indices is not None no_thousands_columns = self._set_noconvert_dtype_columns( self._col_indices, self.columns ) if self.columns and self.dtype: - if no_thousands_columns is None: - no_thousands_columns = set() assert self._col_indices is not None for i in self._col_indices: if isinstance(self.dtype, dict) and not is_numeric_dtype( From 8ea72940509b4a5d00ad23febcdf0d8d830203f6 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sat, 4 Feb 2023 14:06:38 +0100 Subject: [PATCH 09/18] BUG: str dtype ignored for column with dot VII --- pandas/io/parsers/python_parser.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 610507c8b6672..3b6d1cc801579 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1175,10 +1175,16 @@ def _set_no_thousand_columns(self) -> set[int]: if self.columns and self.dtype: assert self._col_indices is not None for i in self._col_indices: + if not isinstance(self.dtype, dict) and not is_numeric_dtype( + self.dtype + ): + no_thousands_columns.add(i) if isinstance(self.dtype, dict) and not is_numeric_dtype( self.dtype.get(self.columns[i], None) ): - no_thousands_columns.add(i) + for key in self.dtype: + if key == self.columns[i]: + no_thousands_columns.add(i) return no_thousands_columns From 6e5c5fb5e6e176e669f08bbff71e6690000b4df7 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sat, 11 Feb 2023 17:00:43 +0100 Subject: [PATCH 10/18] specify int64 explicitly --- pandas/tests/io/parser/common/test_common_basic.py | 2 +- pandas/tests/io/parser/test_python_parser_only.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index cdbfbc3895dcf..858a0124c618a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -143,7 +143,7 @@ def test__search_replace_num_columns(all_parsers): ) result = parser.read_csv( - StringIO(data), sep=";", dtype={"A": str, "B": int, "C": int}, thousands="." + StringIO(data), sep=";", dtype={"A": str, "B": np.int64, "C": np.int64}, thousands="." ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index ecf24cee8af63..a19765749b2c1 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -14,6 +14,7 @@ ) from typing import Iterator +import numpy as np import pytest from pandas.errors import ( @@ -500,7 +501,7 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only): """ result = parser.read_csv( - StringIO(data), sep=";", dtype={"a": str, "b": float, "c": int}, thousands="," + StringIO(data), sep=";", dtype={"a": str, "b": float, "c": np.int64}, thousands="," ) expected = DataFrame( { From f40184ce953506a0bcbf9580995c701aa7a129fc Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sat, 11 Feb 2023 20:38:20 +0100 Subject: [PATCH 11/18] specify int64 explicitly II --- pandas/tests/io/parser/common/test_common_basic.py | 5 ++++- pandas/tests/io/parser/test_python_parser_only.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 858a0124c618a..d6926a1f76ca6 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -143,7 +143,10 @@ def test__search_replace_num_columns(all_parsers): ) result = parser.read_csv( - StringIO(data), sep=";", dtype={"A": str, "B": np.int64, "C": np.int64}, thousands="." + StringIO(data), + sep=";", + dtype={"A": str, "B": np.int64, "C": np.int64}, + thousands=".", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a19765749b2c1..5ff83f777292b 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -501,7 +501,10 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only): """ result = parser.read_csv( - StringIO(data), sep=";", dtype={"a": str, "b": float, "c": np.int64}, thousands="," + StringIO(data), + sep=";", + dtype={"a": str, "b": float, "c": np.int64}, + thousands=",", ) expected = DataFrame( { From 8fd2e692de9b694ef1aaead189c713def3b4b534 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Mon, 13 Feb 2023 12:27:46 +0100 Subject: [PATCH 12/18] add the original example and remove the redundant check --- pandas/io/parsers/python_parser.py | 2 +- .../io/parser/test_python_parser_only.py | 25 ++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index b455e2e1fbc20..f2511ae50a998 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -889,7 +889,7 @@ def _search_replace_num_columns( if ( not isinstance(x, str) or search not in x - or (self._no_thousands_columns and i in self._no_thousands_columns) + or i in self._no_thousands_columns or not self.num.search(x.strip()) ): rl.append(x) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 5ff83f777292b..f570ab6c9a08d 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -499,7 +499,6 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only): 3,03,001,00514;0;4,001 4923,600,041;23,000;131 """ - result = parser.read_csv( StringIO(data), sep=";", @@ -537,3 +536,27 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only): dtype=float, ) tm.assert_frame_equal(result3, expected3) + + +def test_no_thousand_with_dot_convert_for_non_numeric_cols(python_parser_only): + # GH#50270 + parser = python_parser_only + data = """a;b;c +0000.7995;16.000;0 +3.03.001.00514;0;4.000 +4923.600.041;23.000;131 +""" + result = parser.read_csv( + StringIO(data), + sep=";", + dtype={"a": str}, + thousands=".", + ) + expected = DataFrame( + { + "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], + "b": [16000, 0, 23000], + "c": [0, 4000, 131], + } + ) + tm.assert_frame_equal(result, expected) From c71e5b0cbde78322f0d2ada299a523cba1e78773 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Mon, 13 Feb 2023 22:14:42 +0100 Subject: [PATCH 13/18] remove unnecessary check --- pandas/io/parsers/python_parser.py | 4 +--- pandas/tests/io/parser/test_python_parser_only.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f2511ae50a998..aaaa9ffadb8a8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1181,9 +1181,7 @@ def _set_no_thousand_columns(self) -> set[int]: if isinstance(self.dtype, dict) and not is_numeric_dtype( self.dtype.get(self.columns[i], None) ): - for key in self.dtype: - if key == self.columns[i]: - no_thousands_columns.add(i) + no_thousands_columns.add(i) return no_thousands_columns diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index f570ab6c9a08d..27b0e7ccaac74 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -549,7 +549,7 @@ def test_no_thousand_with_dot_convert_for_non_numeric_cols(python_parser_only): result = parser.read_csv( StringIO(data), sep=";", - dtype={"a": str}, + dtype={"a": str, "b": np.int64, "c": np.int64}, thousands=".", ) expected = DataFrame( From f81116145887080413462a3b40f4adb3b970e8ae Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Fri, 17 Feb 2023 20:03:13 +0100 Subject: [PATCH 14/18] add parametrize to thousand separator test --- pandas/io/parsers/python_parser.py | 6 ++++-- pandas/tests/io/parser/test_python_parser_only.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index aaaa9ffadb8a8..c53c49e936774 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1178,8 +1178,10 @@ def _set_no_thousand_columns(self) -> set[int]: self.dtype ): no_thousands_columns.add(i) - if isinstance(self.dtype, dict) and not is_numeric_dtype( - self.dtype.get(self.columns[i], None) + if ( + isinstance(self.dtype, dict) + and self.columns[i] in self.dtype + and not is_numeric_dtype(self.dtype[self.columns[i]]) ): no_thousands_columns.add(i) return no_thousands_columns diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 27b0e7ccaac74..0a13869f59ed9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -538,7 +538,10 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only): tm.assert_frame_equal(result3, expected3) -def test_no_thousand_with_dot_convert_for_non_numeric_cols(python_parser_only): +@pytest.mark.parametrize( + "dtype", [{"a": str}, {"a": str, "b": np.int64, "c": np.int64}] +) +def test_no_thousand_with_dot_convert_for_non_numeric_cols(python_parser_only, dtype): # GH#50270 parser = python_parser_only data = """a;b;c @@ -549,7 +552,7 @@ def test_no_thousand_with_dot_convert_for_non_numeric_cols(python_parser_only): result = parser.read_csv( StringIO(data), sep=";", - dtype={"a": str, "b": np.int64, "c": np.int64}, + dtype=dtype, thousands=".", ) expected = DataFrame( From f008fabab3dc661a84ad09c3b36152e5cdf783ba Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Fri, 24 Feb 2023 17:19:02 +0100 Subject: [PATCH 15/18] BUG: remove duplicative test --- .../io/parser/common/test_common_basic.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index d6926a1f76ca6..6656face3be84 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -127,30 +127,6 @@ def test_1000_sep(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test__search_replace_num_columns(all_parsers): - parser = all_parsers - data = """A;B;C -155.75;3.001;43.06.4 -16.2;0;23.0 -""" - expected = DataFrame( - { - "A": ["155.75", "16.2"], - "B": [3001, 0], - "C": [43064, 230], - } - ) - - result = parser.read_csv( - StringIO(data), - sep=";", - dtype={"A": str, "B": np.int64, "C": np.int64}, - thousands=".", - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_unnamed_columns(all_parsers): data = """A,B,C,, From 6a329b12b60d7469e2ae7a1f74c0f0688221f670 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Sun, 26 Feb 2023 20:57:17 +0100 Subject: [PATCH 16/18] BUG: add additional parameters in parametrization --- .../io/parser/test_python_parser_only.py | 131 ++++++++++-------- 1 file changed, 70 insertions(+), 61 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 0a13869f59ed9..9a9033440a519 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -491,75 +491,84 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) -def test_no_thousand_convert_for_non_numeric_cols(python_parser_only): - # GH#50270 - parser = python_parser_only - data = """a;b;c +@pytest.mark.parametrize( + "data,dtype,thousands,expected", + [ + ( + """\ +a;b;c +0000.7995;16.000;0 +3.03.001.00514;0;4.000 +4923.600.041;23.000;131""", + {"a": str}, + ".", + DataFrame( + { + "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], + "b": [16000, 0, 23000], + "c": [0, 4000, 131], + } + ), + ), + ( + """\ +a;b;c +0000.7995;16.000;0 +3.03.001.00514;0;4.000 +4923.600.041;23.000;131""", + {"a": str, "b": np.int64, "c": np.int64}, + ".", + DataFrame( + { + "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], + "b": [16000, 0, 23000], + "c": [0, 4000, 131], + } + ), + ), + ( + """\ +a;b;c 0000,7995;16,000.1;0 3,03,001,00514;0;4,001 -4923,600,041;23,000;131 -""" - result = parser.read_csv( - StringIO(data), - sep=";", - dtype={"a": str, "b": float, "c": np.int64}, - thousands=",", - ) - expected = DataFrame( - { - "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], - "b": [16000.1, 0, 23000], - "c": [0, 4001, 131], - } - ) - tm.assert_frame_equal(result, expected) - - result2 = parser.read_csv( - StringIO(data), sep=";", dtype={"a": str, "b": str, "c": str}, thousands="," - ) - expected2 = DataFrame( - { - "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], - "b": ["16,000.1", "0", "23,000"], - "c": ["0", "4,001", "131"], - } - ) - tm.assert_frame_equal(result2, expected2) - - result3 = parser.read_csv(StringIO(data), sep=";", dtype=float, thousands=",") - expected3 = DataFrame( - { - "a": [7995, 30300100514, 4923600041], - "b": [16000.1, 0, 23000], - "c": [0, 4001, 131], - }, - dtype=float, - ) - tm.assert_frame_equal(result3, expected3) - - -@pytest.mark.parametrize( - "dtype", [{"a": str}, {"a": str, "b": np.int64, "c": np.int64}] +4923,600,041;23,000;131""", + {"a": str, "b": np.float64, "c": np.int64}, + ",", + DataFrame( + { + "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], + "b": [16000.1, 0, 23000], + "c": [0, 4001, 131], + } + ), + ), + ( + """\ +a;b;c +0000,7995;16,000.1;0 +3,03,001,00514;0;4,001 +4923,600,041;23,000;131""", + str, + ",", + DataFrame( + { + "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], + "b": ["16,000.1", "0", "23,000"], + "c": ["0", "4,001", "131"], + } + ), + ), + ], ) -def test_no_thousand_with_dot_convert_for_non_numeric_cols(python_parser_only, dtype): +def test_no_thousand_convert_for_non_numeric_cols( + python_parser_only, data, dtype, thousands, expected +): # GH#50270 parser = python_parser_only - data = """a;b;c -0000.7995;16.000;0 -3.03.001.00514;0;4.000 -4923.600.041;23.000;131 -""" result = parser.read_csv( StringIO(data), sep=";", dtype=dtype, - thousands=".", - ) - expected = DataFrame( - { - "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], - "b": [16000, 0, 23000], - "c": [0, 4000, 131], - } + thousands=thousands, ) tm.assert_frame_equal(result, expected) From a31f335b16b99c58f292298de7e246cf01871207 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Wed, 1 Mar 2023 15:36:37 +0100 Subject: [PATCH 17/18] BUG: exclude bool and add object dtype in parametrization --- pandas/io/parsers/python_parser.py | 6 +++++- pandas/tests/io/parser/test_python_parser_only.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index bf87f465d23ae..e2e363be40710 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -35,6 +35,7 @@ ) from pandas.core.dtypes.common import ( + is_bool_dtype, is_integer, is_numeric_dtype, ) @@ -1177,7 +1178,10 @@ def _set_no_thousand_columns(self) -> set[int]: if ( isinstance(self.dtype, dict) and self.columns[i] in self.dtype - and not is_numeric_dtype(self.dtype[self.columns[i]]) + and ( + not is_numeric_dtype(self.dtype[self.columns[i]]) + or is_bool_dtype(self.dtype[self.columns[i]]) + ) ): no_thousands_columns.add(i) return no_thousands_columns diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 9a9033440a519..17354e9685405 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -500,7 +500,7 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse 0000.7995;16.000;0 3.03.001.00514;0;4.000 4923.600.041;23.000;131""", - {"a": str}, + {"a": object}, ".", DataFrame( { From 264346a0f6563a2f9ccade3420cb6f54246b9516 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 2 Mar 2023 16:43:30 +0100 Subject: [PATCH 18/18] BUG: change parameters of parametrization and add a second test --- .../io/parser/test_python_parser_only.py | 83 ++++++++----------- 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 17354e9685405..b22953fedd6af 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -492,67 +492,48 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse @pytest.mark.parametrize( - "data,dtype,thousands,expected", - [ - ( - """\ -a;b;c -0000.7995;16.000;0 -3.03.001.00514;0;4.000 -4923.600.041;23.000;131""", - {"a": object}, - ".", - DataFrame( - { - "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], - "b": [16000, 0, 23000], - "c": [0, 4000, 131], - } - ), - ), - ( - """\ + "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] +) +def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype): + # GH#50270 + parser = python_parser_only + data = """\ a;b;c 0000.7995;16.000;0 3.03.001.00514;0;4.000 -4923.600.041;23.000;131""", - {"a": str, "b": np.int64, "c": np.int64}, - ".", - DataFrame( - { - "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], - "b": [16000, 0, 23000], - "c": [0, 4000, 131], - } - ), - ), +4923.600.041;23.000;131""" + result = parser.read_csv( + StringIO(data), + sep=";", + dtype=dtype, + thousands=".", + ) + expected = DataFrame( + { + "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], + "b": [16000, 0, 23000], + "c": [0, 4000, 131], + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype,expected", + [ ( - """\ -a;b;c -0000,7995;16,000.1;0 -3,03,001,00514;0;4,001 -4923,600,041;23,000;131""", {"a": str, "b": np.float64, "c": np.int64}, - ",", DataFrame( { - "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], "b": [16000.1, 0, 23000], "c": [0, 4001, 131], } ), ), ( - """\ -a;b;c -0000,7995;16,000.1;0 -3,03,001,00514;0;4,001 -4923,600,041;23,000;131""", str, - ",", DataFrame( { - "a": ["0000,7995", "3,03,001,00514", "4923,600,041"], "b": ["16,000.1", "0", "23,000"], "c": ["0", "4,001", "131"], } @@ -560,15 +541,19 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse ), ], ) -def test_no_thousand_convert_for_non_numeric_cols( - python_parser_only, data, dtype, thousands, expected -): +def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected): # GH#50270 parser = python_parser_only + data = """a;b;c +0000,7995;16,000.1;0 +3,03,001,00514;0;4,001 +4923,600,041;23,000;131 +""" result = parser.read_csv( StringIO(data), sep=";", dtype=dtype, - thousands=thousands, + thousands=",", ) + expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"]) tm.assert_frame_equal(result, expected)