First pass at fixing issues with SparseDataFrame merging

hexgnu · hexgnu · commit 555fb91911c7 · 2018-02-01T15:26:30.000+07:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -538,7 +538,7 @@ Reshaping
 - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
 - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
 - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
--
+- Bug in :func:`SparseDataFrame.merge` which raised an error (:issue:`13665`)
 
 
 Categorical
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -2918,14 +2918,15 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
         # GH#19265 pyarrow is passing this
         warnings.warn("fastpath argument is deprecated, will be removed "
                       "in a future release.", DeprecationWarning)
+
     if klass is None:
         dtype = dtype or values.dtype
         klass = get_block_type(values, dtype)
 
     elif klass is DatetimeTZBlock and not is_datetimetz(values):
         return klass(values, ndim=ndim,
                      placement=placement, dtype=dtype)
-
+    
     return klass(values, ndim=ndim, placement=placement)
 
 # TODO: flexible with index=None and/or items=None
@@ -5120,14 +5121,28 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
         elif is_uniform_join_units(join_units):
             b = join_units[0].block.concat_same_type(
                 [ju.block for ju in join_units], placement=placement)
+        elif is_sparse_join_units(join_units):
+            values = concatenate_join_units(join_units, concat_axis, copy=copy)
+            values = values[0]
+            block = join_units[0].block
+
+            if block:
+                fill_value = block.fill_value
+            else:
+                fill_value = np.nan
+            array = SparseArray(values, fill_value=fill_value)
+            b = make_block(array, klass=SparseBlock, placement=placement)
         else:
             b = make_block(
                 concatenate_join_units(join_units, concat_axis, copy=copy),
-                placement=placement)
+                placement=placement
+                )
         blocks.append(b)
 
     return BlockManager(blocks, axes)
 
+def is_sparse_join_units(join_units):
+    return any(type(ju.block) is SparseBlock for ju in join_units)
 
 def is_uniform_join_units(join_units):
     """
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -38,6 +38,8 @@
                                    concatenate_block_managers)
 from pandas.util._decorators import Appender, Substitution
 
+from pandas.core.sparse.array import SparseArray
+
 from pandas.core.sorting import is_int64_overflow_possible
 import pandas.core.algorithms as algos
 import pandas.core.sorting as sorting
@@ -731,7 +733,11 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                 if mask.all():
                     key_col = rvals
                 else:
-                    key_col = Index(lvals).where(~mask, rvals)
+                    # TODO: may need to be an IntIndex rather than an Index
+                    if isinstance(lvals, SparseArray):
+                        key_col = Index(lvals.get_values()).where(~mask, rvals)
+                    else:
+                        key_col = Index(lvals).where(~mask, rvals)
 
                 if result._is_label_reference(name):
                     result[name] = key_col
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
@@ -28,6 +28,8 @@
 import pandas.core.ops as ops
 import pandas.core.common as com
 
+from collections import Counter
+
 _shared_doc_kwargs = dict(klass='SparseDataFrame')
 
 
@@ -73,6 +75,9 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
             if columns is None:
                 raise Exception("cannot pass a series w/o a name or columns")
             data = {columns[0]: data}
+        elif isinstance(data, BlockManager):
+            if default_fill_value is None:
+                default_fill_value, _ = Counter([b.fill_value for b in data.blocks]).most_common(1)[0]
 
         if default_fill_value is None:
             default_fill_value = np.nan
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -7,6 +7,7 @@
 import numpy as np
 import random
 import re
+import itertools
 
 import pandas as pd
 from pandas.compat import lrange, lzip
@@ -1800,3 +1801,31 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
                           how=how,
                           sort=sort)
         tm.assert_frame_equal(result, expected)
+
+class TestMergeSparseDataFrames(object):
+    # TODO: fill_value 0 or 1 does not work yet; only np.nan is exercised
+    @pytest.mark.parametrize('fill_value,how', itertools.product([np.nan], ['left', 'right', 'outer', 'inner']))
+    def test_merge_two_sparse_frames(self, fill_value, how):
+        dense_evens = pd.DataFrame({'A': list(range(0, 200, 2)), 'B': np.random.randint(0,100, size=100)})
+        dense_threes = pd.DataFrame({'A': list(range(0, 300, 3)), 'B': np.random.randint(0,100, size=100)})
+
+        dense_merge = dense_evens.merge(dense_threes, how=how, on='A')
+
+        # A dense merge tends to yield float64 rather than the original dtype
+        dense_merge['B_x'] = dense_merge['B_x'].astype(np.int64, errors='ignore')
+        dense_merge['B_y'] = dense_merge['B_y'].astype(np.int64, errors='ignore')
+
+        sparse_evens = dense_evens.to_sparse(fill_value=fill_value)
+        sparse_threes = dense_threes.to_sparse(fill_value=fill_value)
+
+        sparse_merge = sparse_evens.merge(sparse_threes, how=how, on='A')
+
+        assert sparse_merge.default_fill_value is fill_value
+
+        tm.assert_sp_frame_equal(dense_merge.to_sparse(fill_value=fill_value), sparse_merge, exact_indices=False, check_dtype=False)
+
+
+    @pytest.mark.parametrize('fill_value,how', itertools.product([0, 1, np.nan, None], ['left', 'right', 'outer', 'inner']))
+    def test_merge_dense_sparse_frames(self, fill_value, how):
+        "pass"
+
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
@@ -222,27 +222,30 @@ class Unknown:
                                    '"Unknown" for data argument'):
             SparseDataFrame(Unknown())
 
-    def test_constructor_preserve_attr(self):
+    @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None])
+    def test_constructor_preserve_attr(self, fill_value):
         # GH 13866
-        arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
+        arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=fill_value)
         assert arr.dtype == np.int64
-        assert arr.fill_value == 0
+        assert arr.fill_value == fill_value
 
         df = pd.SparseDataFrame({'x': arr})
         assert df['x'].dtype == np.int64
-        assert df['x'].fill_value == 0
+        assert df['x'].fill_value == fill_value
+        assert df.default_fill_value == fill_value
 
         s = pd.SparseSeries(arr, name='x')
         assert s.dtype == np.int64
-        assert s.fill_value == 0
+        assert s.fill_value == fill_value
 
         df = pd.SparseDataFrame(s)
         assert df['x'].dtype == np.int64
-        assert df['x'].fill_value == 0
+        assert df['x'].fill_value == fill_value
 
         df = pd.SparseDataFrame({'x': s})
         assert df['x'].dtype == np.int64
-        assert df['x'].fill_value == 0
+        assert df['x'].fill_value == fill_value
+
 
     def test_constructor_nan_dataframe(self):
         # GH 10079