Skip to content

Commit 555fb91

Browse files
committed
First pass at fixing issues with SparseDataFrame merging
1 parent ca4ae4f commit 555fb91

File tree

6 files changed

+69
-11
lines changed

6 files changed

+69
-11
lines changed

doc/source/whatsnew/v0.23.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ Reshaping
538538
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
539539
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
540540
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
541-
-
541+
- Bug in :func:`SparseDataFrame.merge` which raised an error when merging sparse frames (:issue:`13665`)
542542

543543

544544
Categorical

pandas/core/internals.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2918,14 +2918,15 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
29182918
# GH#19265 pyarrow is passing this
29192919
warnings.warn("fastpath argument is deprecated, will be removed "
29202920
"in a future release.", DeprecationWarning)
2921+
29212922
if klass is None:
29222923
dtype = dtype or values.dtype
29232924
klass = get_block_type(values, dtype)
29242925

29252926
elif klass is DatetimeTZBlock and not is_datetimetz(values):
29262927
return klass(values, ndim=ndim,
29272928
placement=placement, dtype=dtype)
2928-
2929+
29292930
return klass(values, ndim=ndim, placement=placement)
29302931

29312932
# TODO: flexible with index=None and/or items=None
@@ -5120,14 +5121,28 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
51205121
elif is_uniform_join_units(join_units):
51215122
b = join_units[0].block.concat_same_type(
51225123
[ju.block for ju in join_units], placement=placement)
5124+
elif is_sparse_join_units(join_units):
5125+
values = concatenate_join_units(join_units, concat_axis, copy=copy)
5126+
values = values[0]
5127+
block = join_units[0].block
5128+
5129+
if block:
5130+
fill_value = block.fill_value
5131+
else:
5132+
fill_value = np.nan
5133+
array = SparseArray(values, fill_value=fill_value)
5134+
b = make_block(array, klass=SparseBlock, placement=placement)
51235135
else:
51245136
b = make_block(
51255137
concatenate_join_units(join_units, concat_axis, copy=copy),
5126-
placement=placement)
5138+
placement=placement
5139+
)
51275140
blocks.append(b)
51285141

51295142
return BlockManager(blocks, axes)
51305143

5144+
def is_sparse_join_units(join_units):
5145+
return any(type(ju.block) is SparseBlock for ju in join_units)
51315146

51325147
def is_uniform_join_units(join_units):
51335148
"""

pandas/core/reshape/merge.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
concatenate_block_managers)
3939
from pandas.util._decorators import Appender, Substitution
4040

41+
from pandas.core.sparse.array import SparseArray
42+
4143
from pandas.core.sorting import is_int64_overflow_possible
4244
import pandas.core.algorithms as algos
4345
import pandas.core.sorting as sorting
@@ -731,7 +733,11 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
731733
if mask.all():
732734
key_col = rvals
733735
else:
734-
key_col = Index(lvals).where(~mask, rvals)
736+
# TODO: this might need to be an IntIndex rather than an Index
737+
if isinstance(lvals, SparseArray):
738+
key_col = Index(lvals.get_values()).where(~mask, rvals)
739+
else:
740+
key_col = Index(lvals).where(~mask, rvals)
735741

736742
if result._is_label_reference(name):
737743
result[name] = key_col

pandas/core/sparse/frame.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
import pandas.core.ops as ops
2929
import pandas.core.common as com
3030

31+
from collections import Counter
32+
3133
_shared_doc_kwargs = dict(klass='SparseDataFrame')
3234

3335

@@ -73,6 +75,9 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
7375
if columns is None:
7476
raise Exception("cannot pass a series w/o a name or columns")
7577
data = {columns[0]: data}
78+
elif isinstance(data, BlockManager):
79+
if default_fill_value is None:
80+
default_fill_value, _ = Counter([b.fill_value for b in data.blocks]).most_common(1)[0]
7681

7782
if default_fill_value is None:
7883
default_fill_value = np.nan

pandas/tests/reshape/merge/test_merge.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
import random
99
import re
10+
import itertools
1011

1112
import pandas as pd
1213
from pandas.compat import lrange, lzip
@@ -1800,3 +1801,31 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
18001801
how=how,
18011802
sort=sort)
18021803
tm.assert_frame_equal(result, expected)
1804+
1805+
class TestMergeSparseDataFrames(object):
1806+
# TODO: only fill_value=np.nan is exercised here; fill values of 0 or 1 do not yet work when merging sparse data frames
1807+
@pytest.mark.parametrize('fill_value,how', itertools.product([np.nan], ['left', 'right', 'outer', 'inner']))
1808+
def test_merge_two_sparse_frames(self, fill_value, how):
1809+
dense_evens = pd.DataFrame({'A': list(range(0, 200, 2)), 'B': np.random.randint(0,100, size=100)})
1810+
dense_threes = pd.DataFrame({'A': list(range(0, 300, 3)), 'B': np.random.randint(0,100, size=100)})
1811+
1812+
dense_merge = dense_evens.merge(dense_threes, how=how, on='A')
1813+
1814+
# Merging two dense frames tends to upcast the value columns to float64 instead of keeping the original dtype, so cast them back to int64 where possible
1815+
dense_merge['B_x'] = dense_merge['B_x'].astype(np.int64, errors='ignore')
1816+
dense_merge['B_y'] = dense_merge['B_y'].astype(np.int64, errors='ignore')
1817+
1818+
sparse_evens = dense_evens.to_sparse(fill_value=fill_value)
1819+
sparse_threes = dense_threes.to_sparse(fill_value=fill_value)
1820+
1821+
sparse_merge = sparse_evens.merge(sparse_threes, how=how, on='A')
1822+
1823+
assert sparse_merge.default_fill_value is fill_value
1824+
1825+
tm.assert_sp_frame_equal(dense_merge.to_sparse(fill_value=fill_value), sparse_merge, exact_indices=False, check_dtype=False)
1826+
1827+
1828+
@pytest.mark.parametrize('fill_value,how', itertools.product([0, 1, np.nan, None], ['left', 'right', 'outer', 'inner']))
1829+
def test_merge_dense_sparse_frames(self, fill_value, how):
1830+
"pass"
1831+

pandas/tests/sparse/frame/test_frame.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,27 +222,30 @@ class Unknown:
222222
'"Unknown" for data argument'):
223223
SparseDataFrame(Unknown())
224224

225-
def test_constructor_preserve_attr(self):
225+
@pytest.mark.parametrize('fill_value', [0, 1, np.nan, None])
226+
def test_constructor_preserve_attr(self, fill_value):
226227
# GH 13866
227-
arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
228+
arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=fill_value)
228229
assert arr.dtype == np.int64
229-
assert arr.fill_value == 0
230+
assert arr.fill_value == fill_value
230231

231232
df = pd.SparseDataFrame({'x': arr})
232233
assert df['x'].dtype == np.int64
233-
assert df['x'].fill_value == 0
234+
assert df['x'].fill_value == fill_value
235+
assert df.default_fill_value == fill_value
234236

235237
s = pd.SparseSeries(arr, name='x')
236238
assert s.dtype == np.int64
237-
assert s.fill_value == 0
239+
assert s.fill_value == fill_value
238240

239241
df = pd.SparseDataFrame(s)
240242
assert df['x'].dtype == np.int64
241-
assert df['x'].fill_value == 0
243+
assert df['x'].fill_value == fill_value
242244

243245
df = pd.SparseDataFrame({'x': s})
244246
assert df['x'].dtype == np.int64
245-
assert df['x'].fill_value == 0
247+
assert df['x'].fill_value == fill_value
248+
246249

247250
def test_constructor_nan_dataframe(self):
248251
# GH 10079

0 commit comments

Comments
 (0)