diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 315d18d052d9f..4b887a904400d 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -29,7 +29,11 @@
     ParserError,
 )
 
-from pandas.core.dtypes.common import is_integer
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer,
+    is_numeric_dtype,
+)
 from pandas.core.dtypes.inference import is_dict_like
 
 from pandas.io.common import (
@@ -155,12 +159,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
             self._col_indices = list(range(len(self.columns)))
 
         self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
-        no_thousands_columns: set[int] | None = None
-        if self.parse_dates:
-            no_thousands_columns = self._set_noconvert_dtype_columns(
-                self._col_indices, self.columns
-            )
-        self._no_thousands_columns = no_thousands_columns
+        self._no_thousands_columns = self._set_no_thousand_columns()
 
         if len(self.decimal) != 1:
             raise ValueError("Only length-1 decimal markers supported")
@@ -889,7 +888,7 @@ def _search_replace_num_columns(
                 if (
                     not isinstance(x, str)
                     or search not in x
-                    or (self._no_thousands_columns and i in self._no_thousands_columns)
+                    or i in self._no_thousands_columns
                     or not self.num.search(x.strip())
                 ):
                     rl.append(x)
@@ -1162,6 +1161,47 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar
             ]
         return new_rows
 
+    def _set_no_thousand_columns(self) -> set[int]:
+        """
+        Collect positions of columns that must keep their thousands separator.
+
+        Columns reported by ``_set_noconvert_dtype_columns`` (when ``parse_dates``
+        is set) and columns whose requested ``dtype`` is non-numeric are exempt.
+        With a dict ``dtype``, columns mapped to a boolean dtype are exempt too.
+
+        Returns
+        -------
+        set of int
+            Positions that ``_search_replace_num_columns`` leaves untouched.
+        """
+        no_thousands_columns: set[int] = set()
+        if self.columns and self.parse_dates:
+            assert self._col_indices is not None
+            no_thousands_columns = self._set_noconvert_dtype_columns(
+                self._col_indices, self.columns
+            )
+        if self.columns and self.dtype:
+            assert self._col_indices is not None
+            # Pair every position with its own label. Indexing ``self.columns``
+            # by ``i`` is only right when ``_col_indices`` happens to be
+            # ``range(len(self.columns))``; otherwise it picks the wrong label
+            # or raises IndexError.
+            for i, col in zip(self._col_indices, self.columns):
+                if not isinstance(self.dtype, dict) and not is_numeric_dtype(
+                    self.dtype
+                ):
+                    no_thousands_columns.add(i)
+                if (
+                    isinstance(self.dtype, dict)
+                    and col in self.dtype
+                    and (
+                        not is_numeric_dtype(self.dtype[col])
+                        or is_bool_dtype(self.dtype[col])
+                    )
+                ):
+                    no_thousands_columns.add(i)
+        return no_thousands_columns
+
 
 class FixedWidthReader(abc.Iterator):
     """
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index ca5a757328ba7..b22953fedd6af 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -14,6 +14,7 @@
 )
 from typing import Iterator
 
+import numpy as np
 import pytest
 
 from pandas.errors import (
@@ -488,3 +489,71 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
     )
     expected = DataFrame({"a": ["a", "c", "f"]})
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
+)
+def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
+    # GH#50270
+    parser = python_parser_only
+    data = """\
+a;b;c
+0000.7995;16.000;0
+3.03.001.00514;0;4.000
+4923.600.041;23.000;131"""
+    result = parser.read_csv(
+        StringIO(data),
+        sep=";",
+        dtype=dtype,
+        thousands=".",
+    )
+    expected = DataFrame(
+        {
+            "a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
+            "b": [16000, 0, 23000],
+            "c": [0, 4000, 131],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype,expected",
+    [
+        (
+            {"a": str, "b": np.float64, "c": np.int64},
+            DataFrame(
+                {
+                    "b": [16000.1, 0, 23000],
+                    "c": [0, 4001, 131],
+                }
+            ),
+        ),
+        (
+            str,
+            DataFrame(
+                {
+                    "b": ["16,000.1", "0", "23,000"],
+                    "c": ["0", "4,001", "131"],
+                }
+            ),
+        ),
+    ],
+)
+def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
+    # GH#50270
+    parser = python_parser_only
+    data = """a;b;c
+0000,7995;16,000.1;0
+3,03,001,00514;0;4,001
+4923,600,041;23,000;131
+"""
+    result = parser.read_csv(
+        StringIO(data),
+        sep=";",
+        dtype=dtype,
+        thousands=",",
+    )
+    expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
+    tm.assert_frame_equal(result, expected)