Skip to content

BUG: str dtype ignored for column with dot #50364

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into main from 50270-str-ignore-dot
Mar 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2f40db3
BUG: str dtype ignored for column with dot I
natmokval Dec 20, 2022
b032e18
BUG: add test to str dtype ignored for column with dot I
natmokval Dec 22, 2022
ea1865d
BUG: str dtype ignored for column with dot III
natmokval Dec 22, 2022
1554f75
BUG: str dtype ignored for column with dot IV
natmokval Dec 22, 2022
1e09dfb
BUG: str dtype ignored for column with dot V
natmokval Jan 2, 2023
615a722
BUG: str dtype ignored for column with dot VI
natmokval Jan 5, 2023
e9767f5
Merge branch 'main' into 50270-str-ignore-dot
natmokval Jan 29, 2023
4ac9b77
TEST: added assert for mypy
natmokval Jan 29, 2023
3bf5e1b
TEST: added assert for mypy II
natmokval Jan 29, 2023
8ea7294
BUG: str dtype ignored for column with dot VII
natmokval Feb 4, 2023
d47c8b0
Merge branch 'main' into 50270-str-ignore-dot
natmokval Feb 5, 2023
6e5c5fb
specify int64 explicitly
natmokval Feb 11, 2023
5565c30
Merge branch 'main' into 50270-str-ignore-dot
natmokval Feb 11, 2023
f40184c
specify int64 explicitly II
natmokval Feb 11, 2023
8fd2e69
add the original example and remove the redundant check
natmokval Feb 13, 2023
c71e5b0
remove unnecessary check
natmokval Feb 13, 2023
f811161
add parametrize to thousand separator test
natmokval Feb 17, 2023
5341153
Merge branch 'main' into 50270-str-ignore-dot
natmokval Feb 17, 2023
9b8767d
Merge branch 'main' into 50270-str-ignore-dot
natmokval Feb 17, 2023
c701e04
Merge branch 'main' into 50270-str-ignore-dot
natmokval Feb 18, 2023
f008fab
BUG: remove duplicative test
natmokval Feb 24, 2023
683d208
Merge branch 'main' into 50270-str-ignore-dot
natmokval Feb 24, 2023
6a329b1
BUG: add additional parameters in parametrization
natmokval Feb 26, 2023
a31f335
BUG: exclude bool and add object dtype in parametrization
natmokval Mar 1, 2023
264346a
BUG: change parameters of parametrization and add a second test
natmokval Mar 2, 2023
38b1b15
Merge branch 'main' into 50270-str-ignore-dot
natmokval Mar 2, 2023
6d9ef48
Merge branch 'main' into 50270-str-ignore-dot
natmokval Mar 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 32 additions & 8 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@
ParserError,
)

from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer,
is_numeric_dtype,
)
from pandas.core.dtypes.inference import is_dict_like

from pandas.io.common import (
Expand Down Expand Up @@ -155,12 +159,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
self._col_indices = list(range(len(self.columns)))

self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
no_thousands_columns: set[int] | None = None
if self.parse_dates:
no_thousands_columns = self._set_noconvert_dtype_columns(
self._col_indices, self.columns
)
self._no_thousands_columns = no_thousands_columns
self._no_thousands_columns = self._set_no_thousand_columns()

if len(self.decimal) != 1:
raise ValueError("Only length-1 decimal markers supported")
Expand Down Expand Up @@ -889,7 +888,7 @@ def _search_replace_num_columns(
if (
not isinstance(x, str)
or search not in x
or (self._no_thousands_columns and i in self._no_thousands_columns)
or i in self._no_thousands_columns
or not self.num.search(x.strip())
):
rl.append(x)
Expand Down Expand Up @@ -1162,6 +1161,31 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar
]
return new_rows

def _set_no_thousand_columns(self) -> set[int]:
    """
    Collect the column indices that must keep their thousands separator.

    An index is included when any of the following holds:

    * ``self.parse_dates`` is truthy and ``_set_noconvert_dtype_columns``
      reports the index;
    * ``self.dtype`` is a single (non-dict) dtype that is not numeric, in
      which case every index in ``self._col_indices`` is included;
    * ``self.dtype`` is a dict and the dtype given for that column is
      non-numeric or boolean.

    Returns
    -------
    set of int
        Indices taken from ``self._col_indices``. Empty when
        ``self.columns`` is empty or neither ``parse_dates`` nor ``dtype``
        is set.
    """
    no_thousands_columns: set[int] = set()
    if not self.columns:
        return no_thousands_columns

    if self.parse_dates:
        assert self._col_indices is not None
        no_thousands_columns = self._set_noconvert_dtype_columns(
            self._col_indices, self.columns
        )

    if self.dtype:
        assert self._col_indices is not None
        if not isinstance(self.dtype, dict):
            # One dtype for the whole frame: the decision is the same for
            # every column, so make it once instead of per index.
            if not is_numeric_dtype(self.dtype):
                no_thousands_columns.update(self._col_indices)
        else:
            # Pair each index with its label instead of evaluating
            # ``self.columns[i]``: the two lists are parallel, and an index
            # is not guaranteed to be a valid position into ``self.columns``
            # (presumably the case when only a subset of columns is read).
            for i, col in zip(self._col_indices, self.columns):
                if col not in self.dtype:
                    continue
                col_dtype = self.dtype[col]
                # bool counts as numeric for is_numeric_dtype, but boolean
                # columns must not have separators stripped either.
                if not is_numeric_dtype(col_dtype) or is_bool_dtype(col_dtype):
                    no_thousands_columns.add(i)
    return no_thousands_columns


class FixedWidthReader(abc.Iterator):
"""
Expand Down
69 changes: 69 additions & 0 deletions pandas/tests/io/parser/test_python_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from typing import Iterator

import numpy as np
import pytest

from pandas.errors import (
Expand Down Expand Up @@ -488,3 +489,71 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
)
expected = DataFrame({"a": ["a", "c", "f"]})
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
    # GH#50270
    # With "." as the thousands marker, column "a" (declared object/str) must
    # come back verbatim while "b" and "c" have the separator removed.
    data = """\
a;b;c
0000.7995;16.000;0
3.03.001.00514;0;4.000
4923.600.041;23.000;131"""
    untouched_a = ["0000.7995", "3.03.001.00514", "4923.600.041"]
    expected = DataFrame(
        {"a": untouched_a, "b": [16000, 0, 23000], "c": [0, 4000, 131]}
    )

    read_kwargs = {"sep": ";", "dtype": dtype, "thousands": "."}
    result = python_parser_only.read_csv(StringIO(data), **read_kwargs)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype,expected",
    [
        (
            {"a": str, "b": np.float64, "c": np.int64},
            DataFrame(
                {
                    "b": [16000.1, 0, 23000],
                    "c": [0, 4001, 131],
                }
            ),
        ),
        (
            str,
            DataFrame(
                {
                    "b": ["16,000.1", "0", "23,000"],
                    "c": ["0", "4,001", "131"],
                }
            ),
        ),
    ],
)
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
    # GH#50270
    # Column "a" is a string column in both parametrizations, so its commas
    # must survive; "b"/"c" are converted only when their dtype is numeric.
    parser = python_parser_only
    data = """a;b;c
0000,7995;16,000.1;0
3,03,001,00514;0;4,001
4923,600,041;23,000;131
"""
    result = parser.read_csv(
        StringIO(data),
        sep=";",
        dtype=dtype,
        thousands=",",
    )
    # pytest hands the parametrized object to the test without copying it;
    # work on a copy so the shared DataFrame is not mutated (a second run with
    # the same parameter would otherwise fail on the duplicate "a" column).
    expected = expected.copy()
    expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
    tm.assert_frame_equal(result, expected)