diff --git a/RELEASE.rst b/RELEASE.rst index 8894df02ed989..c6abe3575277e 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -148,6 +148,7 @@ pandas 0.11.0 - Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_) - Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from other values), (GH2850_) + - Unstack of a frame with no nans would always cause dtype upcasting (GH2929_) .. _GH622: https://github.com/pydata/pandas/issues/622 .. _GH797: https://github.com/pydata/pandas/issues/797 @@ -169,6 +170,7 @@ pandas 0.11.0 .. _GH2892: https://github.com/pydata/pandas/issues/2892 .. _GH2909: https://github.com/pydata/pandas/issues/2909 .. _GH2922: https://github.com/pydata/pandas/issues/2922 +.. _GH2929: https://github.com/pydata/pandas/issues/2929 .. _GH2931: https://github.com/pydata/pandas/issues/2931 .. _GH2973: https://github.com/pydata/pandas/issues/2973 .. _GH2967: https://github.com/pydata/pandas/issues/2967 diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index c86273b8a1cca..4598b37d7da6a 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -144,15 +144,23 @@ def get_result(self): def get_new_values(self): values = self.values + # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride + result_shape = (length, result_width) - dtype, fill_value = _maybe_promote(values.dtype) - new_values = np.empty((length, result_width), dtype=dtype) - new_values.fill(fill_value) - new_mask = np.zeros((length, result_width), dtype=bool) + # if our mask is all True, then we can use our existing dtype + if self.mask.all(): + dtype = values.dtype + new_values = np.empty(result_shape, dtype=dtype) + else: + dtype, fill_value = _maybe_promote(values.dtype) + new_values = np.empty(result_shape, dtype=dtype) + new_values.fill(fill_value) + + new_mask = np.zeros(result_shape, dtype=bool) # is there a simpler / faster way of doing this? for i in xrange(values.shape[1]): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cba6adadb8d6c..3f13df5ce0415 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -8242,6 +8242,41 @@ def test_unstack_to_series(self): data = data.unstack() assert_frame_equal(old_data, data) + def test_unstack_dtypes(self): + + # GH 2929 + rows = [[1, 1, 3, 4], + [1, 2, 3, 4], + [2, 1, 3, 4], + [2, 2, 3, 4]] + + df = DataFrame(rows, columns=list('ABCD')) + result = df.get_dtype_counts() + expected = Series({'int64' : 4}) + assert_series_equal(result, expected) + + # single dtype + df2 = df.set_index(['A','B']) + df3 = df2.unstack('B') + result = df3.get_dtype_counts() + expected = Series({'int64' : 4}) + assert_series_equal(result, expected) + + # mixed + df2 = df.set_index(['A','B']) + df2['C'] = 3. + df3 = df2.unstack('B') + result = df3.get_dtype_counts() + expected = Series({'int64' : 2, 'float64' : 2}) + assert_series_equal(result, expected) + + df2['D'] = 'foo' + df3 = df2.unstack('B') + result = df3.get_dtype_counts() + expected = Series({'float64' : 2, 'object' : 2}) + assert_series_equal(result, expected) + + def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 99c081c0cc6cb..c93dcf386e1c9 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1346,7 +1346,7 @@ def test_unstack_group_index_overflow(self): # test roundtrip stacked = result.stack() - assert_series_equal(s.astype(np.float64), + assert_series_equal(s, stacked.reindex(s.index)) # put it at beginning