diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 72344ee003547..04fb0b0695f8f 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``: df1.append(df2, ignore_index=True) +.. _merging.mixed_ndims: + +Concatenating with mixed ndims +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can concatenate a mix of Series and DataFrames. The +Series will be transformed to DataFrames with the column name as +the name of the Series. + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + s1 = Series(randn(6), name='foo') + concat([df1, s1],axis=1) + +If unnamed Series are passed they will be numbered consecutively. + +.. ipython:: python + + s2 = Series(randn(6)) + concat([df1, s2, s2, s2],axis=1) + +Passing ``ignore_index=True`` will drop all name references. + +.. ipython:: python + + concat([df1, s1],axis=1,ignore_index=True) More concatenating with group keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index 322b05b8d8b31..6fa969ee12295 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -98,6 +98,8 @@ API Changes - The top-level :func:`pandas.eval` function does not allow you use the ``'@'`` prefix and provides you with an error message telling you so. - ``NameResolutionError`` was removed because it isn't necessary anymore. +- ``concat`` will now concatenate mixed Series and DataFrames using the Series name + or numbering columns as needed (:issue:`2385`) Experimental Features ~~~~~~~~~~~~~~~~~~~~~ @@ -166,6 +168,7 @@ Bug Fixes - Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`) - Bug in :meth:`DataFrame.replace` where nested dicts were erroneously depending on the order of dictionary keys and values (:issue:`5338`). 
+- Perf issue in concatting with empty objects (:issue:`3259`) pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 76ba2dafd69d6..106e0b1f1ec77 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -66,6 +66,8 @@ API changes - The top-level :func:`pandas.eval` function does not allow you use the ``'@'`` prefix and provides you with an error message telling you so. - ``NameResolutionError`` was removed because it isn't necessary anymore. +- ``concat`` will now concatenate mixed Series and DataFrames using the Series name + or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>` MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 817cf7c5bc155..f0588524e16eb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2209,10 +2209,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # make Nones an empty object if com._count_not_none(*values) != len(values): - v = None - for v in values: - if v is not None: - break + v = next((v for v in values if v is not None), None) if v is None: return DataFrame() elif isinstance(v, NDFrame): diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 41a4cf9984c14..90e713d72bdda 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -957,7 +957,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, objs = [objs[k] for k in keys] if keys is None: - objs = [obj for obj in objs if obj is not None] + objs = [obj for obj in objs if obj is not None ] else: # #1649 clean_keys = [] @@ -973,16 +973,43 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, if len(objs) == 0: raise Exception('All objects passed were None') - # consolidate data + # consolidate data & figure out what our result ndim is going to be + ndims = set() for obj in objs: - if isinstance(obj, NDFrame): - obj.consolidate(inplace=True) - self.objs = objs + if not 
isinstance(obj, NDFrame): + raise TypeError("cannot concatenate a non-NDFrame object") + + # consolidate + obj.consolidate(inplace=True) + ndims.add(obj.ndim) + + # get the sample + # want the highest ndim that we have, and must be non-empty + # unless all objs are empty + sample = None + if len(ndims) > 1: + max_ndim = max(ndims) + for obj in objs: + if obj.ndim == max_ndim and np.sum(obj.shape): + sample = obj + break - sample = objs[0] + else: + # filter out the empties + # if we have no multi-index possibilities + df = DataFrame([ obj.shape for obj in objs ]).sum(1) + non_empties = df[df!=0] + if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None): + objs = [ objs[i] for i in non_empties.index ] + sample = objs[0] + + if sample is None: + sample = objs[0] + self.objs = objs # Need to flip BlockManager axis in the DataFrame special case - if isinstance(sample, DataFrame): + self._is_frame = isinstance(sample, DataFrame) + if self._is_frame: axis = 1 if axis == 0 else 0 self._is_series = isinstance(sample, ABCSeries) @@ -990,11 +1017,39 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, raise AssertionError("axis must be between 0 and {0}, " "input was {1}".format(sample.ndim, axis)) + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + current_column = 0 + max_ndim = sample.ndim + self.objs, objs = [], self.objs + for obj in objs: + + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim-1: + raise ValueError("cannot concatenate unaligned mixed " + "dimensional NDFrame objects") + + else: + name = getattr(obj,'name',None) + if ignore_index or name is None: + name = current_column + current_column += 1 + + # doing a row-wise concatenation so need everything + # to line up + if self._is_frame and axis == 1: + name = 0 + obj = sample._constructor({ name : obj }) + + self.objs.append(obj) + # note: this is the BlockManager axis 
(since DataFrame is transposed) self.axis = axis - self.join_axes = join_axes - self.keys = keys self.names = names self.levels = levels diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 3dee4a671e1f9..fe7dfb7c25380 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1653,6 +1653,77 @@ def test_handle_empty_objects(self): tm.assert_frame_equal(concatted, expected) + # empty as first element with time series + # GH3259 + df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) + empty = DataFrame() + result = concat([df,empty],axis=1) + assert_frame_equal(result, df) + result = concat([empty,df],axis=1) + assert_frame_equal(result, df) + + result = concat([df,empty]) + assert_frame_equal(result, df) + result = concat([empty,df]) + assert_frame_equal(result, df) + + def test_concat_mixed_objs(self): + + # concat mixed series/frames + # GH2385 + + # axis 1 + index=date_range('01-Jan-2013', periods=10, freq='H') + arr = np.arange(10, dtype='int64') + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1,1), index=index) + + expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0]) + result = concat([df,df], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1]) + result = concat([s1,s2], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2]) + result = concat([s1,s2,s1], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3]) + result = concat([s1,df,s2,s2,s1], axis=1) + assert_frame_equal(result, expected) + + # with names + s1.name = 'foo' + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0]) + result = 
concat([s1,df,s2], axis=1) + assert_frame_equal(result, expected) + + s2.name = 'bar' + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar']) + result = concat([s1,df,s2], axis=1) + assert_frame_equal(result, expected) + + # ignore index + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2]) + result = concat([s1,df,s2], axis=1, ignore_index=True) + assert_frame_equal(result, expected) + + # axis 0 + expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0]) + result = concat([s1,df,s2]) + assert_frame_equal(result, expected) + + expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0]) + result = concat([s1,df,s2], ignore_index=True) + assert_frame_equal(result, expected) + + # invalid concatenate of mixed dims + panel = tm.makePanel() + self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1)) + def test_panel_join(self): panel = tm.makePanel() tm.add_nans(panel) @@ -1967,6 +2038,13 @@ def test_concat_series_axis1_same_names_ignore_index(self): result = concat([s1, s2], axis=1, ignore_index=True) self.assertTrue(np.array_equal(result.columns, [0, 1])) + def test_concat_invalid(self): + + # trying to concat a ndframe with a non-ndframe + df1 = mkdf(10, 2) + for obj in [1, dict(), [1, 2], (1, 2) ]: + self.assertRaises(TypeError, lambda: concat([ df1, obj ])) + def test_concat_invalid_first_argument(self): df1 = mkdf(10, 2) df2 = mkdf(10, 2) @@ -1975,15 +2053,6 @@ def test_concat_invalid_first_argument(self): # generator ok though concat(DataFrame(np.random.rand(5,5)) for _ in range(3)) - def test_concat_mixed_types_fails(self): - df = DataFrame(randn(10, 1)) - - with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"): - concat([df[0], df], axis=1) - - with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"): - concat([df, df[0]], axis=1) - class TestOrderedMerge(tm.TestCase): def setUp(self): diff --git a/vb_suite/join_merge.py 
b/vb_suite/join_merge.py index b60009cd272bb..45f3f510d9f08 100644 --- a/vb_suite/join_merge.py +++ b/vb_suite/join_merge.py @@ -186,6 +186,21 @@ def sample(values, k): concat_small_frames = Benchmark('concat([df] * 1000)', setup, start_date=datetime(2012, 1, 1)) + +#---------------------------------------------------------------------- +# Concat empty + +setup = common_setup + """ +df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) +empty = DataFrame() +""" + +concat_empty_frames1 = Benchmark('concat([df,empty])', setup, + start_date=datetime(2012, 1, 1)) +concat_empty_frames2 = Benchmark('concat([empty,df])', setup, + start_date=datetime(2012, 1, 1)) + + #---------------------------------------------------------------------- # Ordered merge