From be667860a0518d0c4d9b459354d8e944f7d0977f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 11 Mar 2018 18:34:02 -0700 Subject: [PATCH 1/4] BUG: Retain tz-aware dtypes with melt Add additional tests --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/reshape/melt.py | 10 +++++++++- pandas/tests/reshape/test_melt.py | 32 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f686a042c1a74..5ecae11fea654 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -896,6 +896,7 @@ Timezones - Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) - Bug when iterating over :class:`DatetimeIndex` that was localized with fixed timezone offset that rounded nanosecond precision to microseconds (:issue:`19603`) - Bug in :func:`DataFrame.diff` that raised an ``IndexError`` with tz-aware values (:issue:`18578`) +- Bug in :func:`melt` that coverted tz-aware dtypes to tz-naive (:issue:`15785`) Offsets ^^^^^^^ diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 01445eb30a9e5..421e7a87df28a 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -13,7 +13,9 @@ import re from pandas.core.dtypes.missing import notna +from pandas.core.dtypes.common import is_extension_type from pandas.core.tools.numeric import to_numeric +from pandas.core.reshape.concat import concat @Appender(_shared_docs['melt'] % @@ -70,7 +72,13 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata = {} for col in id_vars: - mdata[col] = np.tile(frame.pop(col).values, K) + id_data = frame.pop(col) + if is_extension_type(id_data): + # Preserve pandas dtype by not converting to a numpy array + id_data = concat([id_data] * K, ignore_index=True) + else: + id_data = np.tile(id_data.values, K) + mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 000b22d4fdd36..5a2f74ce41d89 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -212,6 +212,38 @@ def test_multiindex(self): res = self.df1.melt() assert res.columns.tolist() == ['CAP', 'low', 'value'] + @pytest.mark.parametrize("col", [ + pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')), + pd.Series(["a", "b", "c", "a", "d"], dtype="category")]) + def test_pandas_dtypes_id_var(self, col): + # GH 15785 + # Pandas dtype in the id + df = DataFrame({'klass': range(5), + 'col': col, + 'attr1': [1, 0, 0, 0, 0], + 'attr2': [0, 1, 0, 0, 0]}) + result = melt(df, id_vars=['klass', 'col'], var_name='attribute', + value_name='value') + expected = DataFrame({'klass': list(range(5)) * 2, + 'col': pd.concat([col] * 2, ignore_index=True), + 'attribute': ['attr1'] * 5 + ['attr2'] * 5, + 'value': [1, 0, 0, 0, 0] + [0, 1, 0, 0, 0]}) + tm.assert_frame_equal(result, expected) + + # Pandas dtype in the column + df = DataFrame({'klass': range(5), + 'col': col, + 'attr1': [1, 0, 0, 0, 0], + 'attr2': col}) + result = melt(df, id_vars=['klass', 'col'], var_name='attribute', + value_name='value') + expected = DataFrame({'klass': list(range(5)) * 2, + 'col': pd.concat([col] * 2, ignore_index=True), + 'attribute': ['attr1'] * 5 + ['attr2'] * 5, + 'value': pd.concat([pd.Series([1, 0, 0, 0, 0]), + col], ignore_index=True)}) + tm.assert_frame_equal(result, expected) + class TestLreshape(object): From 2d28685bbd10ab0bbd5b6a2d28deba21ec19cd9b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 11 Mar 2018 18:59:48 -0700 Subject: [PATCH 2/4] clarify comment --- pandas/tests/reshape/test_melt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 5a2f74ce41d89..ca3ad6d74a24c 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -230,7 +230,7 @@ def test_pandas_dtypes_id_var(self, col): 'value': [1, 0, 0, 0, 0] + [0, 1, 0, 0, 0]}) tm.assert_frame_equal(result, expected) - # Pandas dtype in the column + # Pandas dtype in the value df = DataFrame({'klass': range(5), 'col': col, 'attr1': [1, 0, 0, 0, 0], From 78a2d84513866b62aec26ef7bb0a3aa8093efad4 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 11 Mar 2018 22:39:18 -0700 Subject: [PATCH 3/4] Reduce duplication and dict insertion order fix --- pandas/tests/reshape/test_melt.py | 37 +++++++++++++------------------ 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index ca3ad6d74a24c..7c3461b7bcada 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -215,33 +215,28 @@ def test_multiindex(self): @pytest.mark.parametrize("col", [ pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')), pd.Series(["a", "b", "c", "a", "d"], dtype="category")]) - def test_pandas_dtypes_id_var(self, col): + @pytest.mark.parametrize("pandas_dtype_value", [True, False]) + def test_pandas_dtypes_id_var(self, col, pandas_dtype_value): # GH 15785 # Pandas dtype in the id df = DataFrame({'klass': range(5), 'col': col, - 'attr1': [1, 0, 0, 0, 0], - 'attr2': [0, 1, 0, 0, 0]}) + 'attr1': [1, 0, 0, 0, 0]}) + if pandas_dtype_value: + # Pandas dtype in the value as well + df['attr2'] = col + expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], + ignore_index=True) + else: + df['attr2'] = [0, 1, 0, 0, 0] + expected_value = [1, 0, 0, 0, 0] + [0, 1, 0, 0, 0] result = melt(df, id_vars=['klass', 'col'], var_name='attribute', value_name='value') - expected = DataFrame({'klass': list(range(5)) * 2, - 'col': pd.concat([col] * 2, ignore_index=True), - 'attribute': ['attr1'] * 5 + ['attr2'] * 5, - 'value': [1, 0, 0, 0, 0] + [0, 1, 0, 0, 0]}) - tm.assert_frame_equal(result, expected) - - # Pandas dtype in the value - df = DataFrame({'klass': range(5), - 'col': col, - 'attr1': [1, 0, 0, 0, 0], - 'attr2': col}) - result = melt(df, id_vars=['klass', 'col'], var_name='attribute', - value_name='value') - expected = DataFrame({'klass': list(range(5)) * 2, - 'col': pd.concat([col] * 2, ignore_index=True), - 'attribute': ['attr1'] * 5 + ['attr2'] * 5, - 'value': pd.concat([pd.Series([1, 0, 0, 0, 0]), - col], ignore_index=True)}) + expected = DataFrame({0: list(range(5)) * 2, + 1: pd.concat([col] * 2, ignore_index=True), + 2: ['attr1'] * 5 + ['attr2'] * 5, + 3: expected_value}) + expected.columns = ['klass', 'col', 'attribute', 'value'] tm.assert_frame_equal(result, expected) From abe48c9ef4c2543bcd53804defea48a9cd423fbc Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 12 Mar 2018 18:58:21 -0700 Subject: [PATCH 4/4] address review --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/reshape/melt.py | 1 - pandas/tests/reshape/test_melt.py | 20 +++++++------------- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5ecae11fea654..791365295c268 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -896,7 +896,7 @@ Timezones - Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) - Bug when iterating over :class:`DatetimeIndex` that was localized with fixed timezone offset that rounded nanosecond precision to microseconds (:issue:`19603`) - Bug in :func:`DataFrame.diff` that raised an ``IndexError`` with tz-aware values (:issue:`18578`) -- Bug in :func:`melt` that coverted tz-aware dtypes to tz-naive (:issue:`15785`) +- Bug in :func:`melt` that converted tz-aware dtypes to tz-naive (:issue:`15785`) Offsets ^^^^^^^ diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 421e7a87df28a..ce99d2f8c9a63 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -74,7 +74,6 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, for col in id_vars: id_data = frame.pop(col) if is_extension_type(id_data): - # Preserve pandas dtype by not converting to a numpy array id_data = concat([id_data] * K, ignore_index=True) else: id_data = np.tile(id_data.values, K) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 7c3461b7bcada..81570de7586de 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -214,22 +214,16 @@ def test_multiindex(self): @pytest.mark.parametrize("col", [ pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')), - pd.Series(["a", "b", "c", "a", "d"], dtype="category")]) - @pytest.mark.parametrize("pandas_dtype_value", [True, False]) - def test_pandas_dtypes_id_var(self, col, pandas_dtype_value): + pd.Series(["a", "b", "c", "a", "d"], dtype="category"), + pd.Series([0, 1, 0, 0, 0])]) + def test_pandas_dtypes(self, col): # GH 15785 - # Pandas dtype in the id df = DataFrame({'klass': range(5), 'col': col, - 'attr1': [1, 0, 0, 0, 0]}) - if pandas_dtype_value: - # Pandas dtype in the value as well - df['attr2'] = col - expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], - ignore_index=True) - else: - df['attr2'] = [0, 1, 0, 0, 0] - expected_value = [1, 0, 0, 0, 0] + [0, 1, 0, 0, 0] + 'attr1': [1, 0, 0, 0, 0], + 'attr2': col}) + expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], + ignore_index=True) result = melt(df, id_vars=['klass', 'col'], var_name='attribute', value_name='value') expected = DataFrame({0: list(range(5)) * 2,