BUG: concat(Series[sparse]) raises ValueError #23557

Closed
@TomAugspurger

Description

In [1]: import pandas as pd

In [2]: pd.__version__
Out[2]: '0.23.4'

In [3]: a = pd.Series(pd.SparseArray([0, 1, 2]))

In [4]: pd.concat([a, a], axis=1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-a3e65bc6fb67> in <module>()
----> 1 pd.concat([a, a], axis=1)

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    224                        verify_integrity=verify_integrity,
    225                        copy=copy, sort=sort)
--> 226     return op.get_result()
    227
    228

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/reshape/concat.py in get_result(self)
    398
    399                 index, columns = self.new_axes
--> 400                 df = cons(data, index=index)
    401                 df.columns = columns
    402                 return df.__finalize__(self, method='concat')

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    346                                  dtype=dtype, copy=copy)
    347         elif isinstance(data, dict):
--> 348             mgr = self._init_dict(data, index, columns, dtype=dtype)
    349         elif isinstance(data, ma.MaskedArray):
    350             import numpy.ma.mrecords as mrecords

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
    457             arrays = [data[k] for k in keys]
    458
--> 459         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    460
    461     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   7362     axes = [_ensure_index(columns), _ensure_index(index)]
   7363
-> 7364     return create_block_manager_from_arrays(arrays, arr_names, axes)
   7365
   7366

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
   4875         return mgr
   4876     except ValueError as e:
-> 4877         construction_error(len(arrays), arrays[0].shape, axes, e)
   4878
   4879

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
   4837     implied = tuple(map(int, [len(ax) for ax in axes]))
   4838     if passed == implied and e is not None:
-> 4839         raise e
   4840     if block_shape[0] == 0:
   4841         raise ValueError("Empty data passed with indices specified.")

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
   4870
   4871     try:
-> 4872         blocks = form_blocks(arrays, names, axes)
   4873         mgr = BlockManager(blocks, axes)
   4874         mgr._consolidate_inplace()

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/internals.py in form_blocks(arrays, names, axes)
   4916
   4917     if len(items_dict['IntBlock']):
-> 4918         int_blocks = _multi_blockify(items_dict['IntBlock'])
   4919         blocks.extend(int_blocks)
   4920

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/internals.py in _multi_blockify(tuples, dtype)
   4993     for dtype, tup_block in grouper:
   4994
-> 4995         values, placement = _stack_arrays(list(tup_block), dtype)
   4996
   4997         block = make_block(values, placement=placement)

~/Envs/dask-dev/lib/python3.7/site-packages/pandas/core/internals.py in _stack_arrays(tuples, dtype)
   5037     stacked = np.empty(shape, dtype=dtype)
   5038     for i, arr in enumerate(arrays):
-> 5039         stacked[i] = _asarray_compat(arr)
   5040
   5041     return stacked, placement

ValueError: could not broadcast input array from shape (2) into shape (3)

Right now on master, this returns a SparseDataFrame. I would like it to instead return a regular DataFrame with sparse values. The current rule for the result type on master is "sparse if any of the inputs are sparse"; I would amend that to specifically be "sparse if any of the inputs are a SparseDataFrame or SparseSeries". This will require breaking a couple of places like SparseSeries.unstack(), unless we hack in some special cases for sparse, which I'd like to avoid.
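To make the proposed rule concrete, here is a minimal sketch. The helper name is invented for illustration only (it is not actual pandas internals), and the printed results describe the behavior expected under the proposal, not current behavior:

import pandas as pd

def result_should_be_sparse_container(objs):
    # Hypothetical helper: under the proposed rule, only an actual
    # SparseDataFrame/SparseSeries input makes the concat result a sparse
    # container; a regular Series holding a SparseArray does not.
    return any(isinstance(obj, (pd.SparseDataFrame, pd.SparseSeries))
               for obj in objs)

a = pd.Series(pd.SparseArray([0, 1, 2]))

# Expected under the proposal: a plain DataFrame whose columns keep a sparse dtype.
result = pd.concat([a, a], axis=1)
print(type(result))    # expected: pandas.core.frame.DataFrame
print(result.dtypes)   # expected: sparse dtype for both columns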
