Closed
Description
Code Sample, a copy-pastable example if possible
In [1]: data1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1, columns=['a', 'b', 'c', 'd', 'e'])
In [2]: data2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=['a', 'b', 'x', 'y'])
In [3]: import pyarrow as pa
In [4]: d1 = pa.deserialize(pa.serialize(data1).to_buffer())
In [5]: d2 = pa.deserialize(pa.serialize(data2).to_buffer())
In [6]: d1.merge(d2)
Problem description
The above code raises a ValueError ("buffer source array is read-only"):
In [7]: d1.merge(d2)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-f852b96f603a> in <module>
----> 1 d1.merge(d2)
~/pandas/pandas/core/frame.py in merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
7261 copy=copy,
7262 indicator=indicator,
-> 7263 validate=validate,
7264 )
7265
~/pandas/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
82 validate=validate,
83 )
---> 84 return op.get_result()
85
86
~/pandas/pandas/core/reshape/merge.py in get_result(self)
625 self.left, self.right = self._indicator_pre_merge(self.left, self.right)
626
--> 627 join_index, left_indexer, right_indexer = self._get_join_info()
628
629 ldata, rdata = self.left._data, self.right._data
~/pandas/pandas/core/reshape/merge.py in _get_join_info(self)
842 )
843 else:
--> 844 (left_indexer, right_indexer) = self._get_join_indexers()
845
846 if self.right_index:
~/pandas/pandas/core/reshape/merge.py in _get_join_indexers(self)
821 """ return the join indexers """
822 return _get_join_indexers(
--> 823 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
824 )
825
~/pandas/pandas/core/reshape/merge.py in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
1285
1286 # get left & right join labels and num. of levels at each location
-> 1287 llab, rlab, shape = map(list, zip(*map(fkeys, left_keys, right_keys)))
1288
1289 # get flat i8 keys from label lists
~/pandas/pandas/core/reshape/merge.py in _factorize_keys(lk, rk, sort)
1882 rizer = klass(max(len(lk), len(rk)))
1883
-> 1884 llab = rizer.factorize(lk)
1885 rlab = rizer.factorize(rk)
1886
~/pandas/pandas/_libs/hashtable.pyx in pandas._libs.hashtable.Int64Factorizer.factorize()
109 return self.count
110
--> 111 def factorize(self, int64_t[:] values, sort=False,
112 na_sentinel=-1, na_value=None):
113 """
~/pandas/pandas/_libs/hashtable.cpython-37m-darwin.so in View.MemoryView.memoryview_cwrapper()
~/pandas/pandas/_libs/hashtable.cpython-37m-darwin.so in View.MemoryView.memoryview.__cinit__()
ValueError: buffer source array is read-only
Expected Output
d1.merge(d2) should succeed and return the same result that
d1.copy(deep=True).merge(d2.copy(deep=True))
already gives:
In [10]: d1.copy(deep=True).merge(d2.copy(deep=True))
Out[10]:
a b c d e x y
0 1 2 3 4 5 3 4
Output of pd.show_versions()
I'm working with pandas master, so pd.show_versions()
doesn't work. The git commit hash is a818281a45f7b5bd24f050e5d6868894c5108db6
(the latest commit on the master branch as of 2019-08-16).