From 8d0efca6467c422b105e705834823bc3813464c2 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 30 May 2022 09:58:23 +0200 Subject: [PATCH 01/28] TST: integer overflow on parsing with insufficient user dtype squash! TST: integer overflow on parsing with insufficient user dtype --- pandas/tests/io/parser/test_textreader.py | 28 +++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index f150ed3903443..46e0d7e22d218 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -11,9 +11,9 @@ import pytest import pandas._libs.parsers as parser -from pandas._libs.parsers import TextReader +from pandas._libs.parsers import TextReader, is_extension_array_dtype -from pandas import DataFrame +from pandas import DataFrame, array import pandas._testing as tm from pandas.io.parsers import ( @@ -125,6 +125,30 @@ def test_integer_thousands_alt(self): expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "dtype", [ + "uint64", "int64", "uint32", "int32", "uint16", "int16", "uint8", "int8", + "UInt64", "Int64", "UInt32", "Int32", "UInt16", "Int16", "UInt8", "Int8" + ] + ) + def test_integer_overflow_with_user_dtype(self, dtype): + dtype = ensure_dtype_objs(dtype) + is_ext_dtype = is_extension_array_dtype(dtype) + maxint = np.iinfo(dtype.type if is_ext_dtype else dtype).max + + reader = TextReader(StringIO(f"{maxint}"), header=None, dtype=dtype) + result = reader.read() + if is_ext_dtype: + expected = array([maxint], dtype=dtype) + tm.assert_extension_array_equal(result[0], expected) + else: + expected = np.array([maxint], dtype=dtype) + tm.assert_numpy_array_equal(result[0], expected) + + reader = TextReader(StringIO(f"{maxint + 1}"), header=None, dtype=dtype) + with pytest.raises(Exception): + reader.read() + def test_skip_bad_lines(self, capsys): # too many 
lines, see #2430 for why data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" From 27629f0afcf6d22a48fcb8452ad17d0545ac75c7 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 30 May 2022 10:01:18 +0200 Subject: [PATCH 02/28] BUG: raise on integer overflow when parsing with insufficient user dtype --- pandas/_libs/parsers.pyx | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b07fa143c98b6..1dfdb171f2b83 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1182,13 +1182,21 @@ cdef class TextReader: if user_dtype and na_count is not None: if na_count > 0: raise ValueError(f"Integer column has NA values in column {i}") - except OverflowError: + except OverflowError as err: + if user_dtype and dtype == 'int64': + raise err result = _try_uint64(self.parser, i, start, end, na_filter, na_hashset) na_count = 0 if result is not None and dtype != 'int64': - result = result.astype(dtype) + casted = result.astype(dtype) + if (casted == result).all(): + result = casted + else: + raise TypeError( + f"cannot safely cast non-equivalent {result.dtype} to {dtype}" + ) return result, na_count From bfb0b8979a7406d69ddec8cb69fa8f691cd80b57 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 30 May 2022 09:49:59 +0000 Subject: [PATCH 03/28] Fixes from pre-commit [automated commit] --- pandas/tests/io/parser/test_textreader.py | 33 ++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 46e0d7e22d218..ba90d0b9524e3 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -11,9 +11,15 @@ import pytest import pandas._libs.parsers as parser -from pandas._libs.parsers import TextReader, is_extension_array_dtype +from pandas._libs.parsers import ( + TextReader, + is_extension_array_dtype, +) -from pandas 
import DataFrame, array +from pandas import ( + DataFrame, + array, +) import pandas._testing as tm from pandas.io.parsers import ( @@ -126,10 +132,25 @@ def test_integer_thousands_alt(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "dtype", [ - "uint64", "int64", "uint32", "int32", "uint16", "int16", "uint8", "int8", - "UInt64", "Int64", "UInt32", "Int32", "UInt16", "Int16", "UInt8", "Int8" - ] + "dtype", + [ + "uint64", + "int64", + "uint32", + "int32", + "uint16", + "int16", + "uint8", + "int8", + "UInt64", + "Int64", + "UInt32", + "Int32", + "UInt16", + "Int16", + "UInt8", + "Int8", + ], ) def test_integer_overflow_with_user_dtype(self, dtype): dtype = ensure_dtype_objs(dtype) From 661c853509bd5d290f2ffad98ce1b566c7d1b08e Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 30 May 2022 12:52:43 +0000 Subject: [PATCH 04/28] DOC: added entry in whatsnew --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index eb08034bb92eb..bbc41eeb14717 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -814,7 +814,7 @@ I/O - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) - :meth:`to_html` now excludes the ``border`` attribute from ```` elements when ``border`` keyword is set to ``False``. 
-- +- Bug in :meth:`TextReader.read` with specified (non-extension) integer ``dtype`` can cause silent overflow or unexpected return dtype (:issue:`47167`) Period ^^^^^^ From ccb6f6144610000ac383922c341682e75a17a897 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 30 May 2022 13:12:15 +0000 Subject: [PATCH 05/28] Introduce emtpy match in pytest.raises for flake8 --- pandas/tests/io/parser/test_textreader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index ba90d0b9524e3..52a7ec680fe36 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -167,7 +167,7 @@ def test_integer_overflow_with_user_dtype(self, dtype): tm.assert_numpy_array_equal(result[0], expected) reader = TextReader(StringIO(f"{maxint + 1}"), header=None, dtype=dtype) - with pytest.raises(Exception): + with pytest.raises(Exception, match=""): reader.read() def test_skip_bad_lines(self, capsys): From a3b458a4b92ccfdb050391f3a3b0e14bfc43e500 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 30 May 2022 13:47:04 +0000 Subject: [PATCH 06/28] Changed import location of is_extension_array_dtype for type check --- pandas/tests/io/parser/test_textreader.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 52a7ec680fe36..fc3086fb3e91c 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -11,16 +11,14 @@ import pytest import pandas._libs.parsers as parser -from pandas._libs.parsers import ( - TextReader, - is_extension_array_dtype, -) +from pandas._libs.parsers import TextReader from pandas import ( DataFrame, array, ) import pandas._testing as tm +from pandas.api.types import is_extension_array_dtype from pandas.io.parsers import ( TextFileReader, From 
994a6345bdefb58584d02cef23bac00215fa98a7 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 31 May 2022 14:52:53 +0000 Subject: [PATCH 07/28] PERF: avoid try parse as int64 if user specified uint64 --- pandas/_libs/parsers.pyx | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1dfdb171f2b83..cecefbf8dbf59 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1176,20 +1176,27 @@ cdef class TextReader: return result, na_count elif is_integer_dtype(dtype): - try: - result, na_count = _try_int64(self.parser, i, start, - end, na_filter, na_hashset) - if user_dtype and na_count is not None: - if na_count > 0: - raise ValueError(f"Integer column has NA values in column {i}") - except OverflowError as err: - if user_dtype and dtype == 'int64': - raise err + do_try_uint64 = False + if user_dtype and dtype == 'uint64': + do_try_uint64 = True + else: + try: + result, na_count = _try_int64(self.parser, i, start, + end, na_filter, na_hashset) + if user_dtype and na_count is not None: + if na_count > 0: + raise ValueError(f"Integer column has NA values in column {i}") + except OverflowError as err: + if user_dtype and dtype == 'int64': + raise err + do_try_uint64 = True + + if do_try_uint64: result = _try_uint64(self.parser, i, start, end, na_filter, na_hashset) na_count = 0 - if result is not None and dtype != 'int64': + if result is not None and dtype not in ('int64', 'uint64'): casted = result.astype(dtype) if (casted == result).all(): result = casted From 43bcb220bad250125aaa5ea957face5bea8d0927 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 31 May 2022 23:32:58 +0000 Subject: [PATCH 08/28] TST: simple asv for uint8 parsing --- asv_bench/benchmarks/io/csv.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 10aef954a3475..6da4f16160660 100644 --- 
a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -285,6 +285,15 @@ def time_read_uint64_na_values(self): ) +class ReadUint8Integers(StringIORewind): + def setup(self): + arr = np.arange(10000).astype("uint8") + self.data1 = StringIO("\n".join(arr.astype(str).tolist())) + + def time_read_uint8(self): + read_csv(self.data(self.data1), header=None, names=["foo"], dtype="uint8") + + class ReadCSVThousands(BaseIO): fname = "__test__.csv" From 61a36a5cecc7d1fc0aefc11326f33c9e6e496ac2 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 31 May 2022 23:36:26 +0000 Subject: [PATCH 09/28] BUG: stringio rewind in asv ReadCSVIndexCol --- asv_bench/benchmarks/io/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 6da4f16160660..d7ad6f3a4dc42 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -576,7 +576,7 @@ def setup(self): self.StringIO_input = StringIO(data) def time_read_csv_index_col(self): - read_csv(self.StringIO_input, index_col="a") + read_csv(self.data(self.StringIO_input), index_col="a") from ..pandas_vb_common import setup # noqa: F401 isort:skip From 927ddad322b59975255ac0c8f07ace0898441fd2 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 6 Jun 2022 21:47:27 +0000 Subject: [PATCH 10/28] CLN: simplified conditional logic for int parsing --- pandas/_libs/parsers.pyx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cecefbf8dbf59..88de27693577b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1176,9 +1176,10 @@ cdef class TextReader: return result, na_count elif is_integer_dtype(dtype): - do_try_uint64 = False if user_dtype and dtype == 'uint64': - do_try_uint64 = True + result = _try_uint64(self.parser, i, start, + end, na_filter, na_hashset) + na_count = 0 else: try: result, na_count = 
_try_int64(self.parser, i, start, @@ -1189,12 +1190,9 @@ cdef class TextReader: except OverflowError as err: if user_dtype and dtype == 'int64': raise err - do_try_uint64 = True - - if do_try_uint64: - result = _try_uint64(self.parser, i, start, end, - na_filter, na_hashset) - na_count = 0 + result = _try_uint64(self.parser, i, start, + end, na_filter, na_hashset) + na_count = 0 if result is not None and dtype not in ('int64', 'uint64'): casted = result.astype(dtype) From 498b93df15d98a5eae4e8ddd29f3d681864b94f8 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Sat, 23 Jul 2022 21:19:03 +0000 Subject: [PATCH 11/28] TST: reduced repetition by using any_int_dtype in test --- pandas/tests/io/parser/test_textreader.py | 25 ++--------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index fc3086fb3e91c..9fe03e59090ec 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -129,29 +129,8 @@ def test_integer_thousands_alt(self): expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype", - [ - "uint64", - "int64", - "uint32", - "int32", - "uint16", - "int16", - "uint8", - "int8", - "UInt64", - "Int64", - "UInt32", - "Int32", - "UInt16", - "Int16", - "UInt8", - "Int8", - ], - ) - def test_integer_overflow_with_user_dtype(self, dtype): - dtype = ensure_dtype_objs(dtype) + def test_integer_overflow_with_user_dtype(self, any_int_dtype): + dtype = ensure_dtype_objs(any_int_dtype) is_ext_dtype = is_extension_array_dtype(dtype) maxint = np.iinfo(dtype.type if is_ext_dtype else dtype).max From 9e1fbbc2dcf08a14e3480e30a0d80f83e63dda71 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Sat, 23 Jul 2022 21:20:45 +0000 Subject: [PATCH 12/28] TST: added tests for read_csv with both engines c and python --- pandas/tests/io/parser/common/test_ints.py | 22 
++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index e3159ef3e6a42..7c4da53b223f8 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -12,6 +12,10 @@ Series, ) import pandas._testing as tm +from pandas.api.types import ( + is_extension_array_dtype, + pandas_dtype, +) # GH#43650: Some expected failures with the pyarrow engine can occasionally # cause a deadlock instead, so we skip these instead of xfailing @@ -110,6 +114,24 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) +@skip_pyarrow +def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): + dtype = any_int_dtype + parser = all_parsers + + pdtype = pandas_dtype(any_int_dtype) + iinfo = np.iinfo(pdtype.type if is_extension_array_dtype(dtype) else pdtype) + + for x in [iinfo.max, iinfo.min]: + result = parser.read_csv(StringIO(f"{x}"), header=None, dtype=dtype) + expected = DataFrame([x], dtype=dtype) + tm.assert_frame_equal(result, expected) + + for x in [iinfo.max + 1, iinfo.min - 1]: + with pytest.raises(Exception, match=""): + parser.read_csv(StringIO(f"{x}"), header=None, dtype=dtype) + + def test_int64_min_issues(all_parsers): # see gh-2599 parser = all_parsers From ef91ab56687d10bfe922d914866d776df4737032 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Sat, 23 Jul 2022 22:10:35 +0000 Subject: [PATCH 13/28] BUG: raise on integer overflow when parsing with insufficient user dtype --- pandas/io/parsers/base_parser.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 531fa5400f466..d1cf738193a65 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -785,11 +785,16 @@ def _cast_types(self, values, cast_type, column): else: try: - values = 
astype_nansafe(values, cast_type, copy=True, skipna=True) + casted = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" ) from err + if is_integer_dtype(cast_type) and not (casted == values).all(): + raise TypeError( + f"cannot safely cast non-equivalent {values.dtype} to {cast_type}" + ) + values = casted return values @overload From 270eb9031f703594ffc1a274248870ba6292d3b0 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Sun, 24 Jul 2022 20:43:10 +0000 Subject: [PATCH 14/28] TST: added/modified tests to raise on lossy float conversion due to specified int dtype --- pandas/tests/io/parser/common/test_ints.py | 17 +++++++++++++++++ pandas/tests/io/parser/test_read_fwf.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 7c4da53b223f8..0e26715f0e22e 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. 
""" +from contextlib import nullcontext from io import StringIO import numpy as np @@ -132,6 +133,22 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): parser.read_csv(StringIO(f"{x}"), header=None, dtype=dtype) +@pytest.mark.parametrize( + "val,expected", + [ + (0.0, nullcontext()), # lossless conversion does not raise + (0.1, pytest.raises(Exception, match=None)), # noqa: PDF010 + ], +) +def test_integer_from_float_raises(all_parsers, any_int_dtype, val, expected): + dtype = any_int_dtype + parser = all_parsers + data = f"0\n{val}" + + with expected: + parser.read_csv(StringIO(data), header=None, dtype=dtype) + + def test_int64_min_issues(all_parsers): # see gh-2599 parser = all_parsers diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index d6d787df39dfa..33c776f9fbed9 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -556,7 +556,7 @@ def test_variable_width_unicode(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "int32"}]) +@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "float16"}]) def test_dtype(dtype): data = """ a b c 1 2 3.2 From dd5cd0e8350537ec35ff83f881a4f8dd76e4da2f Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Sun, 24 Jul 2022 20:45:55 +0000 Subject: [PATCH 15/28] DOC: minor correction in test docstring --- pandas/tests/io/parser/common/test_ints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 0e26715f0e22e..838b4e69ebb9f 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -209,7 +209,7 @@ def test_int64_overflow(all_parsers, conv): ) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as 
string. + # range, so they should be parsed as integer value. parser = all_parsers result = parser.read_csv(StringIO(str(val)), header=None) From 64047b428ef4086cfbaa7711ffa082de9903dea5 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Sun, 24 Jul 2022 21:22:23 +0000 Subject: [PATCH 16/28] DOC: explained changes in whatsnew in terms of public api --- doc/source/whatsnew/v1.5.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7f81cb6fc520d..f1385a4b1e780 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -958,7 +958,8 @@ I/O - Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x00 control bytes (:issue:`47099`) - Bug in :func:`read_parquet` with ``use_nullable_dtypes=True`` where ``float64`` dtype was returned instead of nullable ``Float64`` dtype (:issue:`45694`) - Bug in :meth:`DataFrame.to_json` where ``PeriodDtype`` would not make the serialization roundtrip when read back with :meth:`read_json` (:issue:`44720`) -- Bug in :meth:`TextReader.read` with specified (non-extension) integer ``dtype`` can cause silent overflow or unexpected return dtype (:issue:`47167`) +- Bug in :func:`read_csv` with specified (non-extension) integer ``dtype`` can cause silent overflow or unexpected return dtype (:issue:`47167`) +- Bug in :func:`read_csv` with specified (non-extension) integer ``dtype`` and ``engine="python"`` can cause silent lossy float coercion (:issue:`47167`) Period ^^^^^^ From d812c3222b15a4b4aec5fcd842009bfffae011e8 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 25 Jul 2022 07:20:26 +0000 Subject: [PATCH 17/28] TST: added missing skip_pyarrow mark --- pandas/tests/io/parser/common/test_ints.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 838b4e69ebb9f..856ee15db4a25 100644 --- 
a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -133,6 +133,7 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): parser.read_csv(StringIO(f"{x}"), header=None, dtype=dtype) +@skip_pyarrow @pytest.mark.parametrize( "val,expected", [ From b1f83b9d2159f73f74add321259a965dc3c101e2 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 25 Jul 2022 20:01:03 +0000 Subject: [PATCH 18/28] TST: specified exceptions in pytest.raises --- pandas/tests/io/parser/common/test_ints.py | 6 +++--- pandas/tests/io/parser/test_textreader.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 856ee15db4a25..88f5ee3ddd9a8 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -120,7 +120,7 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): dtype = any_int_dtype parser = all_parsers - pdtype = pandas_dtype(any_int_dtype) + pdtype = pandas_dtype(dtype) iinfo = np.iinfo(pdtype.type if is_extension_array_dtype(dtype) else pdtype) for x in [iinfo.max, iinfo.min]: @@ -129,7 +129,7 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): tm.assert_frame_equal(result, expected) for x in [iinfo.max + 1, iinfo.min - 1]: - with pytest.raises(Exception, match=""): + with pytest.raises((OverflowError, TypeError, ValueError), match=None): parser.read_csv(StringIO(f"{x}"), header=None, dtype=dtype) @@ -138,7 +138,7 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): "val,expected", [ (0.0, nullcontext()), # lossless conversion does not raise - (0.1, pytest.raises(Exception, match=None)), # noqa: PDF010 + (0.1, pytest.raises((TypeError, ValueError), match=None)), # noqa: PDF010 ], ) def test_integer_from_float_raises(all_parsers, any_int_dtype, val, expected): diff --git 
a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 9fe03e59090ec..9dd49732aea3b 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -144,7 +144,7 @@ def test_integer_overflow_with_user_dtype(self, any_int_dtype): tm.assert_numpy_array_equal(result[0], expected) reader = TextReader(StringIO(f"{maxint + 1}"), header=None, dtype=dtype) - with pytest.raises(Exception, match=""): + with pytest.raises((OverflowError, TypeError, ValueError), match=None): reader.read() def test_skip_bad_lines(self, capsys): From 8f4c9471439812e1f3a70a6d4ca8d219e35422f9 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Mon, 25 Jul 2022 21:14:45 +0000 Subject: [PATCH 19/28] TST: replaced loop cases with parametrized tests --- pandas/tests/io/parser/common/test_ints.py | 47 +++++++++++++++------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 88f5ee3ddd9a8..0107fe8003f2c 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -115,22 +115,38 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): - dtype = any_int_dtype - parser = all_parsers - +def _iinfo(dtype): pdtype = pandas_dtype(dtype) iinfo = np.iinfo(pdtype.type if is_extension_array_dtype(dtype) else pdtype) + return iinfo - for x in [iinfo.max, iinfo.min]: - result = parser.read_csv(StringIO(f"{x}"), header=None, dtype=dtype) - expected = DataFrame([x], dtype=dtype) - tm.assert_frame_equal(result, expected) - for x in [iinfo.max + 1, iinfo.min - 1]: - with pytest.raises((OverflowError, TypeError, ValueError), match=None): - parser.read_csv(StringIO(f"{x}"), header=None, dtype=dtype) +_raises_int_overflow = pytest.raises( # noqa: PDF010 + (OverflowError, 
TypeError, ValueError), match=None +) + + +@skip_pyarrow +@pytest.mark.parametrize( + "getval,expected", + [ + (lambda dtype: _iinfo(dtype).max, nullcontext()), # in range does not raise + (lambda dtype: _iinfo(dtype).min, nullcontext()), # in range does not raise + (lambda dtype: _iinfo(dtype).max + 1, _raises_int_overflow), + (lambda dtype: _iinfo(dtype).min - 1, _raises_int_overflow), + ], +) +def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, expected): + dtype = any_int_dtype + parser = all_parsers + val = getval(dtype) + data = f"A\n{val}" + + with expected: + result = parser.read_csv(StringIO(data), dtype=dtype) + if "result" in locals(): + expected_result = DataFrame({"A": [val]}, dtype=dtype) + tm.assert_frame_equal(result, expected_result) @skip_pyarrow @@ -144,10 +160,13 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype): def test_integer_from_float_raises(all_parsers, any_int_dtype, val, expected): dtype = any_int_dtype parser = all_parsers - data = f"0\n{val}" + data = f"A\n0\n{val}" with expected: - parser.read_csv(StringIO(data), header=None, dtype=dtype) + result = parser.read_csv(StringIO(data), dtype=dtype) + if "result" in locals(): + expected_result = DataFrame({"A": [0, val]}, dtype=dtype) + tm.assert_frame_equal(result, expected_result) def test_int64_min_issues(all_parsers): From 520cae348251d387165203c4e59215996fc869b6 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 23 Aug 2022 20:28:05 +0000 Subject: [PATCH 20/28] CLN: moved na-check into else branch --- pandas/_libs/parsers.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a7218fa5d3d5f..7d2bf97db85a9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1187,15 +1187,15 @@ cdef class TextReader: try: result, na_count = _try_int64(self.parser, i, start, end, na_filter, na_hashset) - if user_dtype and na_count is not None: - if 
na_count > 0: - raise ValueError(f"Integer column has NA values in column {i}") except OverflowError as err: if user_dtype and dtype == 'int64': raise err result = _try_uint64(self.parser, i, start, end, na_filter, na_hashset) na_count = 0 + else: + if user_dtype and (na_count is not None) and (na_count > 0): + raise ValueError(f"Integer column has NA values in column {i}") if result is not None and dtype not in ('int64', 'uint64'): casted = result.astype(dtype) From 88d86508d83907bafd4a44af26ecb8dfbe1ee469 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Thu, 25 Aug 2022 08:32:52 +0000 Subject: [PATCH 21/28] CLN: re-use maybe_cast_to_integer_array for checked cast in python parser --- pandas/io/parsers/base_parser.py | 13 ++++++------- pandas/tests/io/parser/common/test_ints.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index eeb4f8159f88f..fb821016a73e5 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -43,6 +43,7 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.astype import astype_nansafe +from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, @@ -789,16 +790,14 @@ def _cast_types(self, values, cast_type, column): else: try: - casted = astype_nansafe(values, cast_type, copy=True, skipna=True) - except ValueError as err: + if is_integer_dtype(cast_type): + values = maybe_cast_to_integer_array(values, cast_type, copy=True) + else: + values = astype_nansafe(values, cast_type, copy=True, skipna=True) + except (ValueError, OverflowError) as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" ) from err - if is_integer_dtype(cast_type) and not (casted == values).all(): - raise TypeError( - f"cannot safely cast non-equivalent {values.dtype} to {cast_type}" - ) - values = casted 
return values @overload diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 0107fe8003f2c..ed47b59d784a4 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -15,6 +15,7 @@ import pandas._testing as tm from pandas.api.types import ( is_extension_array_dtype, + is_unsigned_integer_dtype, pandas_dtype, ) @@ -142,6 +143,22 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, ex val = getval(dtype) data = f"A\n{val}" + # Positive value overflow with uint8, uint16, uint32 and any overflow with + # int8, int16, int32 only throw a FutureWarning until deprecation from #41734 + # becomes enforced. After enforcement, the following block must be deleted. + if ( + (expected is _raises_int_overflow) + and (parser.engine == "python") + and (not is_extension_array_dtype(dtype)) + and (dtype < np.dtype("int64")) + and not (is_unsigned_integer_dtype(dtype) and (val < 0)) + ): + expected = tm.assert_produces_warning( + FutureWarning, + match=f"Values are too large to be losslessly cast to {dtype}.", + check_stacklevel=False, + ) + with expected: result = parser.read_csv(StringIO(data), dtype=dtype) if "result" in locals(): From d96d6b08727175446e602a6fb1a82462b932a148 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Thu, 25 Aug 2022 09:37:59 +0000 Subject: [PATCH 22/28] TST: specified expected exception --- pandas/tests/io/parser/common/test_ints.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index ed47b59d784a4..8cc081a498be6 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -122,8 +122,15 @@ def _iinfo(dtype): return iinfo -_raises_int_overflow = pytest.raises( # noqa: PDF010 - (OverflowError, TypeError, ValueError), match=None 
+_raises_any_integer_cast_exception = pytest.raises( # noqa: PDF010 + (OverflowError, TypeError, ValueError), + match=( + "(Overflow)|" + "(Python int too large to convert to C long)|" + "(cannot safely cast non-equivalent)|" + "(Integer out of range)|" + "(Unable to convert column)" + ), ) @@ -133,8 +140,8 @@ def _iinfo(dtype): [ (lambda dtype: _iinfo(dtype).max, nullcontext()), # in range does not raise (lambda dtype: _iinfo(dtype).min, nullcontext()), # in range does not raise - (lambda dtype: _iinfo(dtype).max + 1, _raises_int_overflow), - (lambda dtype: _iinfo(dtype).min - 1, _raises_int_overflow), + (lambda dtype: _iinfo(dtype).max + 1, _raises_any_integer_cast_exception), + (lambda dtype: _iinfo(dtype).min - 1, _raises_any_integer_cast_exception), ], ) def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, expected): @@ -147,7 +154,7 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, ex # int8, int16, int32 only throw a FutureWarning until deprecation from #41734 # becomes enforced. After enforcement, the following block must be deleted. 
if ( - (expected is _raises_int_overflow) + (expected == _raises_any_integer_cast_exception) and (parser.engine == "python") and (not is_extension_array_dtype(dtype)) and (dtype < np.dtype("int64")) From 2c16f74f05245d30d3acffa4adf3caa304258520 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Thu, 25 Aug 2022 12:30:21 +0000 Subject: [PATCH 23/28] TST: fixed int-overflow test --- pandas/tests/io/parser/common/test_ints.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 8cc081a498be6..2f53161bb7454 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -124,12 +124,14 @@ def _iinfo(dtype): _raises_any_integer_cast_exception = pytest.raises( # noqa: PDF010 (OverflowError, TypeError, ValueError), - match=( - "(Overflow)|" - "(Python int too large to convert to C long)|" - "(cannot safely cast non-equivalent)|" - "(Integer out of range)|" - "(Unable to convert column)" + match="|".join( + [ + "Overflow", + "cannot safely cast non-equivalent", + "Integer out of range", + "Unable to convert column", + "The elements provided in the data cannot all be casted to the dtype", + ] ), ) @@ -162,13 +164,13 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, ex ): expected = tm.assert_produces_warning( FutureWarning, - match=f"Values are too large to be losslessly cast to {dtype}.", + match=f"Values are too large to be losslessly cast to {np.dtype(dtype)}.", check_stacklevel=False, ) with expected: result = parser.read_csv(StringIO(data), dtype=dtype) - if "result" in locals(): + if isinstance(expected, nullcontext): expected_result = DataFrame({"A": [val]}, dtype=dtype) tm.assert_frame_equal(result, expected_result) From a1a67646b91c0cdf441fb576ce2f6c17e560c128 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 6 Sep 2022 20:28:52 +0000 Subject: [PATCH 
24/28] CLN: create asv input without overflow to prevent potential warnings --- asv_bench/benchmarks/io/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index d7ad6f3a4dc42..012395b9afe1c 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -287,7 +287,7 @@ def time_read_uint64_na_values(self): class ReadUint8Integers(StringIORewind): def setup(self): - arr = np.arange(10000).astype("uint8") + arr = np.tile(np.arange(256, dtype="uint8"), 50) self.data1 = StringIO("\n".join(arr.astype(str).tolist())) def time_read_uint8(self): From c9e8a926a78c9f7a085fac395e750aa54731b945 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 6 Sep 2022 20:58:34 +0000 Subject: [PATCH 25/28] DOC: fixed wording in whatsnew --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 42ae1ec4dc4b3..1159b53ef2a3a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1136,8 +1136,8 @@ I/O - Bug in :func:`read_parquet` with ``use_nullable_dtypes=True`` where ``float64`` dtype was returned instead of nullable ``Float64`` dtype (:issue:`45694`) - Bug in :meth:`DataFrame.to_json` where ``PeriodDtype`` would not make the serialization roundtrip when read back with :meth:`read_json` (:issue:`44720`) - Bug in :func:`read_xml` when reading XML files with Chinese character tags and would raise ``XMLSyntaxError`` (:issue:`47902`) -- Bug in :func:`read_csv` with specified (non-extension) integer ``dtype`` can cause silent overflow or unexpected return dtype (:issue:`47167`) -- Bug in :func:`read_csv` with specified (non-extension) integer ``dtype`` and ``engine="python"`` can cause silent lossy float coercion (:issue:`47167`) +- Bug in :func:`read_csv` with specified numpy integer ``dtype`` can cause silent overflow or unexpected 
return dtype (:issue:`47167`) +- Bug in :func:`read_csv` with specified numpy integer ``dtype`` and ``engine="python"`` can cause silent lossy float coercion (:issue:`47167`) Period ^^^^^^ From 39b5c91171bd5da6876f95c077c74c4597f645b1 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 6 Sep 2022 21:13:17 +0000 Subject: [PATCH 26/28] TST: split float to int coercion test into two separate tests --- pandas/tests/io/parser/common/test_ints.py | 29 +++++++++++----------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 2f53161bb7454..039754d9f7c67 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -176,23 +176,24 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, ex @skip_pyarrow -@pytest.mark.parametrize( - "val,expected", - [ - (0.0, nullcontext()), # lossless conversion does not raise - (0.1, pytest.raises((TypeError, ValueError), match=None)), # noqa: PDF010 - ], -) -def test_integer_from_float_raises(all_parsers, any_int_dtype, val, expected): +def test_integer_from_float_lossless(all_parsers, any_int_dtype): dtype = any_int_dtype parser = all_parsers - data = f"A\n0\n{val}" + data = "A\n0\n0.0" - with expected: - result = parser.read_csv(StringIO(data), dtype=dtype) - if "result" in locals(): - expected_result = DataFrame({"A": [0, val]}, dtype=dtype) - tm.assert_frame_equal(result, expected_result) + result = parser.read_csv(StringIO(data), dtype=dtype) + expected_result = DataFrame({"A": [0, 0]}, dtype=dtype) + tm.assert_frame_equal(result, expected_result) + + +@skip_pyarrow +def test_integer_from_float_lossy(all_parsers, any_int_dtype): + dtype = any_int_dtype + parser = all_parsers + data = "A\n0\n0.1" + + with pytest.raises((TypeError, ValueError), match=None): + parser.read_csv(StringIO(data), dtype=dtype) def test_int64_min_issues(all_parsers): From 
92fab5955375267548c80848fdd622f53731c474 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 6 Sep 2022 21:33:41 +0000 Subject: [PATCH 27/28] TST: improved comment and referenced issue --- pandas/tests/io/parser/common/test_ints.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 039754d9f7c67..0f2292b4906c8 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -147,14 +147,14 @@ def _iinfo(dtype): ], ) def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, expected): + # see GH-47167 dtype = any_int_dtype parser = all_parsers val = getval(dtype) data = f"A\n{val}" - # Positive value overflow with uint8, uint16, uint32 and any overflow with - # int8, int16, int32 only throw a FutureWarning until deprecation from #41734 - # becomes enforced. After enforcement, the following block must be deleted. + # Specific case has intended behavior only after deprecation from #41734 becomes + # enforced. Until then, only expect a FutureWarning. if ( (expected == _raises_any_integer_cast_exception) and (parser.engine == "python") From f653e96a43ebeca1399df38fc5563ac108f79d11 Mon Sep 17 00:00:00 2001 From: SandroCasagrande Date: Tue, 6 Sep 2022 21:53:34 +0000 Subject: [PATCH 28/28] TST: avoid conditional raise --- pandas/tests/io/parser/common/test_ints.py | 59 +++++++++++++--------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 0f2292b4906c8..4d6d4cc4eb569 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -2,7 +2,6 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. 
""" -from contextlib import nullcontext from io import StringIO import numpy as np @@ -122,42 +121,57 @@ def _iinfo(dtype): return iinfo -_raises_any_integer_cast_exception = pytest.raises( # noqa: PDF010 - (OverflowError, TypeError, ValueError), - match="|".join( - [ - "Overflow", - "cannot safely cast non-equivalent", - "Integer out of range", - "Unable to convert column", - "The elements provided in the data cannot all be casted to the dtype", - ] - ), +@skip_pyarrow +@pytest.mark.parametrize( + "getval", + [ + (lambda dtype: _iinfo(dtype).max), + (lambda dtype: _iinfo(dtype).min), + ], ) +def test_integer_limits_with_user_dtype(all_parsers, any_int_dtype, getval): + dtype = any_int_dtype + parser = all_parsers + val = getval(dtype) + data = f"A\n{val}" + + result = parser.read_csv(StringIO(data), dtype=dtype) + expected_result = DataFrame({"A": [val]}, dtype=dtype) + tm.assert_frame_equal(result, expected_result) @skip_pyarrow @pytest.mark.parametrize( - "getval,expected", + "getval", [ - (lambda dtype: _iinfo(dtype).max, nullcontext()), # in range does not raise - (lambda dtype: _iinfo(dtype).min, nullcontext()), # in range does not raise - (lambda dtype: _iinfo(dtype).max + 1, _raises_any_integer_cast_exception), - (lambda dtype: _iinfo(dtype).min - 1, _raises_any_integer_cast_exception), + (lambda dtype: _iinfo(dtype).max + 1), + (lambda dtype: _iinfo(dtype).min - 1), ], ) -def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, expected): +def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval): # see GH-47167 dtype = any_int_dtype parser = all_parsers val = getval(dtype) data = f"A\n{val}" + expected = pytest.raises( # noqa: PDF010 + (OverflowError, TypeError, ValueError), + match="|".join( + [ + "Overflow", + "cannot safely cast non-equivalent", + "Integer out of range", + "Unable to convert column", + "The elements provided in the data cannot all be casted to the dtype", + ] + ), + ) + # Specific case has 
intended behavior only after deprecation from #41734 becomes # enforced. Until then, only expect a FutureWarning. if ( - (expected == _raises_any_integer_cast_exception) - and (parser.engine == "python") + (parser.engine == "python") and (not is_extension_array_dtype(dtype)) and (dtype < np.dtype("int64")) and not (is_unsigned_integer_dtype(dtype) and (val < 0)) @@ -169,10 +183,7 @@ def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval, ex ) with expected: - result = parser.read_csv(StringIO(data), dtype=dtype) - if isinstance(expected, nullcontext): - expected_result = DataFrame({"A": [val]}, dtype=dtype) - tm.assert_frame_equal(result, expected_result) + parser.read_csv(StringIO(data), dtype=dtype) @skip_pyarrow