Description
Code Sample
import pandas as pd
pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]})) # fails
pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple([1]), tuple([2])]})) # fails
Traceback
TypeError Traceback (most recent call last)
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
307 try:
--> 308 vals = hashing.hash_object_array(vals, hash_key, encoding)
309 except TypeError:
pandas/_libs/hashing.pyx in pandas._libs.hashing.hash_object_array()
TypeError: ('1',) of type <class 'tuple'> is not a valid type for hashing, must be string or null
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input> in <module>
1 import pandas as pd
2
----> 3 pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]}))
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_pandas_object(obj, index, encoding, hash_key, categorize)
129 num_items += 1
130 hashes = itertools.chain(hashes, index_hash_generator)
--> 131 h = _combine_hash_arrays(hashes, num_items)
132
133 h = Series(h, index=obj.index, dtype="uint64", copy=False)
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _combine_hash_arrays(arrays, num_items)
37 """
38 try:
---> 39 first = next(arrays)
40 except StopIteration:
41 return np.array([], dtype=np.uint64)
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in <genexpr>(.0)
114
115 elif isinstance(obj, ABCDataFrame):
--> 116 hashes = (hash_array(series.values) for _, series in obj.items())
117 num_items = len(obj.columns)
118 if index:
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
303 codes, categories = factorize(vals, sort=False)
304 cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
--> 305 return _hash_categorical(cat, encoding, hash_key)
306
307 try:
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _hash_categorical(c, encoding, hash_key)
221 # Convert ExtensionArrays to ndarrays
222 values = np.asarray(c.categories.values)
--> 223 hashed = hash_array(values, encoding, hash_key, categorize=False)
224
225 # we have uint64, as we don't directly support missing values
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
310 # we have mixed types
311 vals = hashing.hash_object_array(
--> 312 vals.astype(str).astype(object), hash_key, encoding
313 )
314
ValueError: setting an array element with a sequence
Problem description
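Tuples are immutable and hashable, so hashing a DataFrame column that contains tuples should work, correct? Instead, hash_pandas_object raises a ValueError.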
Expected Output
Hashed dataframe elements.
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Linux
OS-release : 5.0.0-1018-azure
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8
pandas : 0.25.1
numpy : 1.17.2
pytz : 2019.3
dateutil : 2.8.0
pip : 19.2.3
setuptools : 41.4.0
Cython : None
pytest : 5.2.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.10.3
IPython : 7.8.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.1
numexpr : None
odfpy : None
openpyxl : 3.0.0
pandas_gbq : None
pyarrow : 0.15.0
pytables : None
s3fs : None
scipy : 1.3.1
sqlalchemy : 1.3.10
tables : None
xarray : None
xlrd : 1.2.0
xlwt : None
xlsxwriter : None