From c34d6343dfc3b0508f764bb1202ce3fe6cb05b76 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 20 Dec 2013 08:04:35 -0500 Subject: [PATCH 1/2] BUG: construction of DataFrame from empty Series regression (GH5756) --- doc/source/release.rst | 3 ++- pandas/core/frame.py | 7 ++++--- pandas/tests/test_indexing.py | 8 ++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 8ac168e18233f..a2d6ae61bd064 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -247,7 +247,8 @@ API Changes (:issue:`4390`) - allow ``ix/loc`` for Series/DataFrame/Panel to set on any axis even when the single-key is not currently contained in the index for that axis - (:issue:`2578`, :issue:`5226`, :issue:`5632`, :issue:`5720`, :issue:`5744`) + (:issue:`2578`, :issue:`5226`, :issue:`5632`, :issue:`5720`, + :issue:`5744`, :issue:`5756`) - Default export for ``to_clipboard`` is now csv with a sep of `\t` for compat (:issue:`3368`) - ``at`` now will enlarge the object inplace (and return the same) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8a3869d15c85f..ed6d7fef4dd66 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -325,15 +325,16 @@ def _init_dict(self, data, index, columns, dtype=None): def _init_ndarray(self, values, index, columns, dtype=None, copy=False): if isinstance(values, Series): - if columns is None and values.name is not None: - columns = [values.name] + if columns is None: + if values.name is not None: + columns = [values.name] if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) - if not len(values) and len(columns): + if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) values = _prep_ndarray(values, copy=copy) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index c7fb209b4aacb..f4e203444acfc 100644 --- a/pandas/tests/test_indexing.py +++ 
b/pandas/tests/test_indexing.py @@ -1793,6 +1793,14 @@ def f(): expected = DataFrame(columns=['A','B','C']) assert_frame_equal(result,expected) + # GH 5756 + # setting with empty Series + df = DataFrame(Series()) + assert_frame_equal(df, DataFrame({ 0 : Series() })) + + df = DataFrame(Series(name='foo')) + assert_frame_equal(df, DataFrame({ 'foo' : Series() })) + def test_cache_updating(self): # GH 4939, make sure to update the cache on setitem From 4f5994e764607d506fd9f42b5e35a6d8791d1c5a Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 20 Dec 2013 08:41:19 -0500 Subject: [PATCH 2/2] BUG: Row-wise concat of differing dtypes failing in certain cases (GH5754) --- doc/source/release.rst | 1 + pandas/tests/test_frame.py | 42 +++++++++++++++++++++++++++++ pandas/tools/merge.py | 55 ++++++++++++++++++++------------------ 3 files changed, 72 insertions(+), 26 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index a2d6ae61bd064..173d03f9be3c8 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -828,6 +828,7 @@ Bug Fixes - Bug in fillna with Series and a passed series/dict (:issue:`5703`) - Bug in groupby transform with a datetime-like grouper (:issue:`5712`) - Bug in multi-index selection in PY3 when using certain keys (:issue:`5725`) + - Row-wise concat of differing dtypes failing in certain cases (:issue:`5754`) pandas 0.12.0 ------------- diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3a29fa41046ca..5e00d14a0e0cb 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -6154,6 +6154,48 @@ def test_append_empty_dataframe(self): expected = df1.copy() assert_frame_equal(result, expected) + def test_append_dtypes(self): + + # GH 5754 + # row appends of different dtypes (so need to do by-item) + # can sometimes infer the correct type + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(5)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + 
assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : 'foo' }, index=lrange(1,2)) + result = df1.append(df2) + expected = DataFrame({ 'bar' : [ Timestamp('20130101'), 'foo' ]}) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : np.nan }, index=lrange(1,2)) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ Timestamp('20130101'), np.nan ],dtype='M8[ns]') }) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : np.nan }, index=lrange(1,2), dtype=object) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ Timestamp('20130101'), np.nan ],dtype='M8[ns]') }) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : np.nan }, index=lrange(1)) + df2 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1,2)) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ np.nan, Timestamp('20130101')] ,dtype='M8[ns]') }) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : 1 }, index=lrange(1,2), dtype=object) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ Timestamp('20130101'), 1 ]) }) + assert_frame_equal(result, expected) + def test_asfreq(self): offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) rule_monthly = self.tsframe.asfreq('BM') diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index c76bdea950650..dd7ab65869303 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1139,52 +1139,55 @@ def _concat_blocks(self, blocks): def _concat_single_item(self, objs, item): # this is called if we don't have consistent dtypes in a row-wise append - all_values = [] - dtypes = set() + dtypes = [] + alls = set() + # figure out the 
resulting dtype of the combination for data, orig in zip(objs, self.objs): + d = dict([ (t,False) for t in ['object','datetime','timedelta','other'] ]) if item in orig: values = data.get(item) if hasattr(values,'to_dense'): values = values.to_dense() - dtypes.add(values.dtype) all_values.append(values) - else: - all_values.append(None) - # figure out the resulting dtype of the combination - alls = set() - seen = [] - for dtype in dtypes: - d = dict([ (t,False) for t in ['object','datetime','timedelta','other'] ]) - if issubclass(dtype.type, (np.object_, np.bool_)): - d['object'] = True - alls.add('object') - elif is_datetime64_dtype(dtype): - d['datetime'] = True - alls.add('datetime') - elif is_timedelta64_dtype(dtype): - d['timedelta'] = True - alls.add('timedelta') + dtype = values.dtype + + if issubclass(dtype.type, (np.object_, np.bool_)): + d['object'] = True + alls.add('object') + elif is_datetime64_dtype(dtype): + d['datetime'] = True + alls.add('datetime') + elif is_timedelta64_dtype(dtype): + d['timedelta'] = True + alls.add('timedelta') + else: + d['other'] = True + alls.add('other') + else: + all_values.append(None) d['other'] = True alls.add('other') - seen.append(d) + + dtypes.append(d) if 'datetime' in alls or 'timedelta' in alls: if 'object' in alls or 'other' in alls: - for v, s in zip(all_values,seen): - if s.get('datetime') or s.get('timedelta'): + + for v, d in zip(all_values,dtypes): + if d.get('datetime') or d.get('timedelta'): pass # if we have all null, then leave a date/time like type # if we have only that type left - elif isnull(v).all(): + elif v is None or isnull(v).all(): - alls.remove('other') - alls.remove('object') + alls.discard('other') + alls.discard('object') # create the result if 'object' in alls: @@ -1200,7 +1203,7 @@ def _concat_single_item(self, objs, item): to_concat = [] for obj, item_values in zip(objs, all_values): - if item_values is None: + if item_values is None or isnull(item_values).all(): shape = obj.shape[1:] 
missing_arr = np.empty(shape, dtype=empty_dtype) missing_arr.fill(fill_value)