From 6ac82f8c1bd9070c8537a950205ed3741c539b7e Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 29 Mar 2020 15:50:19 +0200 Subject: [PATCH 01/16] BUG: fix GH33113, conversion of empty DataFrame to SparseDtype --- pandas/tests/extension/base/dtype.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 154fcdc38826d..f196c9e80910c 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -123,3 +123,7 @@ def test_get_common_dtype(self, dtype): # still testing as good practice to have this working (and it is the # only case we can test in general) assert dtype._get_common_dtype([dtype]) == dtype + + def test_astype_empty_dataframe(self, dtype): + empty_dataframe = pd.DataFrame() + assert empty_dataframe.astype(dtype) == empty_dataframe From e76f1fc1ad060c66ee5f75ad28936d3f9d26ea5e Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Tue, 31 Mar 2020 00:51:05 +0200 Subject: [PATCH 02/16] return empty dataframe when nothing to concatenate --- pandas/core/generic.py | 3 +++ pandas/tests/extension/base/dtype.py | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1404d225eea97..c2bbfa07913ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5548,6 +5548,9 @@ def astype( new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) return self._constructor(new_data).__finalize__(self, method="astype") + if not results: + return pd.DataFrame() + # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) result.columns = self.columns diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index f196c9e80910c..f680c6282a65d 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -4,6 +4,7 @@ import pytest import pandas as pd +import pandas._testing as tm from .base import BaseExtensionTests @@ -68,8 +69,17 @@ def test_check_dtype(self, data): {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} ) - # TODO(numpy-1.20): This warnings filter and if block can be removed - # once we require numpy>=1.20 + # np.dtype('int64') == 'Int64' == 'int64' + # so can't distinguish + if dtype.name == "Int64": + expected = pd.Series([True, True, False, True], index=list("ABCD")) + else: + expected = pd.Series( + [True, True, False, False], index=list("ABCD") + ) + + # XXX: This should probably be *fixed* not ignored. + # See libops.scalar_compare with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) result = df.dtypes == str(dtype) @@ -106,7 +116,9 @@ def test_construct_from_string(self, dtype): assert isinstance(dtype_instance, type(dtype)) def test_construct_from_string_another_type_raises(self, dtype): - msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" + msg = ( + f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" + ) with pytest.raises(TypeError, match=msg): type(dtype).construct_from_string("another_type") @@ -126,4 +138,4 @@ def test_get_common_dtype(self, dtype): def test_astype_empty_dataframe(self, dtype): empty_dataframe = pd.DataFrame() - assert empty_dataframe.astype(dtype) == empty_dataframe + tm.assert_frame_equal(empty_dataframe.astype(dtype), empty_dataframe) From 8a96a476f2b8523839c01e646d6511daf0e27a02 Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Tue, 31 Mar 2020 00:53:45 +0200 Subject: [PATCH 03/16] revert black formatting --- pandas/tests/extension/base/dtype.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index f680c6282a65d..644c5491a8f17 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -74,9 +74,7 @@ def test_check_dtype(self, data): if dtype.name == "Int64": expected = pd.Series([True, True, False, True], index=list("ABCD")) else: - expected = pd.Series( - [True, True, False, False], index=list("ABCD") - ) + expected = pd.Series([True, True, False, False], index=list("ABCD")) # XXX: This should probably be *fixed* not ignored. # See libops.scalar_compare @@ -116,9 +114,7 @@ def test_construct_from_string(self, dtype): assert isinstance(dtype_instance, type(dtype)) def test_construct_from_string_another_type_raises(self, dtype): - msg = ( - f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" - ) + msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" with pytest.raises(TypeError, match=msg): type(dtype).construct_from_string("another_type") From a4aad1444cea0190af0a5b2589e7c3409be1e44c Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 21 Jun 2020 15:49:49 +0200 Subject: [PATCH 04/16] restore-to-master --- pandas/tests/extension/base/dtype.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 644c5491a8f17..154fcdc38826d 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -4,7 +4,6 @@ import pytest import pandas as pd -import pandas._testing as tm from .base import BaseExtensionTests @@ -69,15 +68,8 @@ def test_check_dtype(self, data): {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} ) - # np.dtype('int64') == 'Int64' == 'int64' - # so can't distinguish - if dtype.name == "Int64": - expected = pd.Series([True, True, False, True], index=list("ABCD")) - else: - expected = pd.Series([True, True, False, False], index=list("ABCD")) - - # XXX: This should probably be *fixed* not ignored. - # See libops.scalar_compare + # TODO(numpy-1.20): This warnings filter and if block can be removed + # once we require numpy>=1.20 with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) result = df.dtypes == str(dtype) @@ -131,7 +123,3 @@ def test_get_common_dtype(self, dtype): # still testing as good practice to have this working (and it is the # only case we can test in general) assert dtype._get_common_dtype([dtype]) == dtype - - def test_astype_empty_dataframe(self, dtype): - empty_dataframe = pd.DataFrame() - tm.assert_frame_equal(empty_dataframe.astype(dtype), empty_dataframe) From d42709b3e1c0a3e699102fc1942fb721f045ceae Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 21 Jun 2020 15:55:06 +0200 Subject: [PATCH 05/16] return instance of class to handle frame or series --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c2bbfa07913ef..bdda655193838 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5549,7 +5549,7 @@ def astype( return self._constructor(new_data).__finalize__(self, method="astype") if not results: - return pd.DataFrame() + return self.__class__() # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) From 708fdc71951991e4c26bfdf13e971f9cd481bd6b Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 21 Jun 2020 15:59:49 +0200 Subject: [PATCH 06/16] add test --- pandas/tests/extension/test_sparse.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f318934ef5e52..f62e7defa46c9 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -358,6 +358,10 @@ def test_astype_str(self, data): def test_astype_string(self, data): super().test_astype_string(data) + def test_astype_empty_frame(self, dtype): + empty_df = pd.DataFrame() + tm.assert_frame_equal(empty_df.astype(dtype), empty_df) + class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests): series_scalar_exc = None From 5b1fe9f3f8809b99b0a379ad67d643ad6d0280ab Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 21 Jun 2020 16:18:31 +0200 Subject: [PATCH 07/16] use type(self) instead of self.__class__ --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bdda655193838..17d25cb2ad9ce 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5549,7 +5549,7 @@ def astype( return self._constructor(new_data).__finalize__(self, method="astype") if not results: - return self.__class__() + return type(self)() # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) From 738739e7274fc4f6d56c62ad229d7cea8d341db8 Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 21 Jun 2020 16:19:38 +0200 Subject: [PATCH 08/16] return self.copy() --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 17d25cb2ad9ce..bf48410cd4815 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5549,7 +5549,7 @@ def astype( return self._constructor(new_data).__finalize__(self, method="astype") if not results: - return type(self)() + return self.copy() # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) From a9e7e15c9503b4e7bca35adf4cb2b890df245493 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 21 Jun 2020 19:47:15 +0100 Subject: [PATCH 09/16] move test to extension array tests --- pandas/tests/extension/base/casting.py | 6 ++++++ pandas/tests/extension/test_sparse.py | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 567a62a8b33a5..d019df6146eb9 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -50,3 +50,9 @@ def test_to_numpy(self, data): result = pd.Series(data).to_numpy() self.assert_equal(result, expected) + + def test_astype_empty_dataframe(self, dtype): + # https://github.com/pandas-dev/pandas/issues/33113 + df = pd.DataFrame() + result = df.astype(dtype) + self.assert_frame_equal(result, df) \ No newline at end of file diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f62e7defa46c9..f318934ef5e52 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -358,10 +358,6 @@ def test_astype_str(self, data): def test_astype_string(self, data): super().test_astype_string(data) - def test_astype_empty_frame(self, dtype): - empty_df = pd.DataFrame() - tm.assert_frame_equal(empty_df.astype(dtype), empty_df) - class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests): series_scalar_exc = None From 9c7f06b38b7b6f66616eb0ee89238349b2c3be19 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 21 Jun 2020 19:48:30 +0100 Subject: [PATCH 10/16] black fixup --- pandas/tests/extension/base/casting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index d019df6146eb9..3aaf040a4279b 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -55,4 +55,4 @@ def test_astype_empty_dataframe(self, dtype): # https://github.com/pandas-dev/pandas/issues/33113 df = pd.DataFrame() result = df.astype(dtype) - self.assert_frame_equal(result, df) \ No newline at end of file + self.assert_frame_equal(result, df) From 49280cbd8d11850b154c734713cc914aef8d944c Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 21 Jun 2020 20:57:25 +0200 Subject: [PATCH 11/16] add whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 567b6853bd633..17bb380a418e4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1068,6 +1068,7 @@ Sparse - Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) - Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`) +- Fixed bug where :class:`DataFrame` or :class:`Series` could not be cast to :class:`SparseDtype` when empty (:issue:`33113`) ExtensionArray ^^^^^^^^^^^^^^ From f0f099c95b6b077153557a9cb50194981c521a5e Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Sun, 21 Jun 2020 22:02:09 +0200 Subject: [PATCH 12/16] address @simonjayhawkins's comments --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/generic.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 17bb380a418e4..3ff4e4e7da636 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1068,7 +1068,7 @@ Sparse - Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) - Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`) -- Fixed bug where :class:`DataFrame` or :class:`Series` could not be cast to :class:`SparseDtype` when empty (:issue:`33113`) +- Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bf48410cd4815..c4559262fbb40 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5542,15 +5542,15 @@ def astype( self.iloc[:, i].astype(dtype, copy=copy) for i in range(len(self.columns)) ] + # GH 33113: handle empty frame or series + if not results: + return self.copy() else: # else, only a single dtype is given new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) return self._constructor(new_data).__finalize__(self, method="astype") - if not results: - return self.copy() - # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) result.columns = self.columns From dcdf11ff2ff2a01b566b61668db453ae9e017ec8 Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Mon, 22 Jun 2020 17:16:07 +0200 Subject: [PATCH 13/16] revert back to previous solution + add test --- pandas/core/generic.py | 7 ++++--- pandas/tests/extension/base/casting.py | 7 +++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c4559262fbb40..0fd4c6c8c1fee 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5542,15 +5542,16 @@ def astype( self.iloc[:, i].astype(dtype, copy=copy) for i in range(len(self.columns)) ] - # GH 33113: handle empty frame or series - if not results: - return self.copy() else: # else, only a single dtype is given new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) return self._constructor(new_data).__finalize__(self, method="astype") + # GH 33113: handle empty frame or series + if not results: + return self.copy() + # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) result.columns = self.columns diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 3aaf040a4279b..8a6dc5c2c1764 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -56,3 +56,10 @@ def test_astype_empty_dataframe(self, dtype): df = pd.DataFrame() result = df.astype(dtype) self.assert_frame_equal(result, df) + + def test_astype_empty_dataframe_empty_dict(self, dtype): + # issue mentioned further down in the following issue's thread + # https://github.com/pandas-dev/pandas/issues/33113 + df = pd.DataFrame() + result = df.astype(dict()) + self.assert_frame_equal(result, df) From e2d4241016d7270bd360b9dda01064ac70dbf1bc Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Mon, 22 Jun 2020 17:28:16 +0200 Subject: [PATCH 14/16] move test --- pandas/tests/extension/base/casting.py | 7 ------- pandas/tests/frame/methods/test_astype.py | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 8a6dc5c2c1764..3aaf040a4279b 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -56,10 +56,3 @@ def test_astype_empty_dataframe(self, dtype): df = pd.DataFrame() result = df.astype(dtype) self.assert_frame_equal(result, df) - - def test_astype_empty_dataframe_empty_dict(self, dtype): - # issue mentioned further down in the following issue's thread - # https://github.com/pandas-dev/pandas/issues/33113 - df = pd.DataFrame() - result = df.astype(dict()) - self.assert_frame_equal(result, df) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b06c3d72a2c77..4300d273a74ec 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -557,3 +557,10 @@ def test_astype_dt64tz_to_str(self, timezone_frame): assert ( "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" ) in result + + def test_astype_empty_dtype_dict(self, dtype): + # issue mentioned further down in the following issue's thread + # https://github.com/pandas-dev/pandas/issues/33113 + df = pd.DataFrame() + result = df.astype(dict()) + self.assert_frame_equal(result, df) From 7bc6255f750b32a475dad17bd44fa0d5d7bae732 Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Mon, 22 Jun 2020 17:29:09 +0200 Subject: [PATCH 15/16] fix test --- pandas/tests/frame/methods/test_astype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 4300d273a74ec..42d114748e769 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -558,7 +558,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" ) in result - def test_astype_empty_dtype_dict(self, dtype): + def test_astype_empty_dtype_dict(self): # issue mentioned further down in the following issue's thread # https://github.com/pandas-dev/pandas/issues/33113 df = pd.DataFrame() From 95fd399a40770240c154abf8fa37de6ae0cd2aea Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Mon, 22 Jun 2020 18:03:09 +0200 Subject: [PATCH 16/16] fix test again --- pandas/tests/frame/methods/test_astype.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 42d114748e769..b0fd0496ea81e 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -561,6 +561,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): def test_astype_empty_dtype_dict(self): # issue mentioned further down in the following issue's thread # https://github.com/pandas-dev/pandas/issues/33113 - df = pd.DataFrame() + df = DataFrame() result = df.astype(dict()) - self.assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) + assert result is not df