Skip to content

hash_pandas_object fails on tuple #28969

Closed
@mgsnuno

Description

@mgsnuno

Code Sample

import pandas as pd

pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]})) # fails
pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple([1]), tuple([2])]})) # fails
Traceback

TypeError                                 Traceback (most recent call last)
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    307         try:
--> 308             vals = hashing.hash_object_array(vals, hash_key, encoding)
    309         except TypeError:

pandas/_libs/hashing.pyx in pandas._libs.hashing.hash_object_array()

TypeError: ('1',) of type <class 'tuple'> is not a valid type for hashing, must be string or null

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-...> in <module>
      1 import pandas as pd
      2 
----> 3 pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]}))

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_pandas_object(obj, index, encoding, hash_key, categorize)
    129             num_items += 1
    130             hashes = itertools.chain(hashes, index_hash_generator)
--> 131         h = _combine_hash_arrays(hashes, num_items)
    132 
    133         h = Series(h, index=obj.index, dtype="uint64", copy=False)

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _combine_hash_arrays(arrays, num_items)
     37     """
     38     try:
---> 39         first = next(arrays)
     40     except StopIteration:
     41         return np.array([], dtype=np.uint64)

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in <genexpr>(.0)
    114 
    115     elif isinstance(obj, ABCDataFrame):
--> 116         hashes = (hash_array(series.values) for _, series in obj.items())
    117         num_items = len(obj.columns)
    118         if index:

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    303             codes, categories = factorize(vals, sort=False)
    304             cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
--> 305             return _hash_categorical(cat, encoding, hash_key)
    306 
    307         try:

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _hash_categorical(c, encoding, hash_key)
    221     # Convert ExtensionArrays to ndarrays
    222     values = np.asarray(c.categories.values)
--> 223     hashed = hash_array(values, encoding, hash_key, categorize=False)
    224 
    225     # we have uint64, as we don't directly support missing values

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    310             # we have mixed types
    311             vals = hashing.hash_object_array(
--> 312                 vals.astype(str).astype(object), hash_key, encoding
    313             )
    314 

ValueError: setting an array element with a sequence

Problem description

Tuples are immutable and hashable, so hashing them should work, correct?

Expected Output

Hashed dataframe elements.

Output of pd.show_versions()

INSTALLED VERSIONS

commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Linux
OS-release : 5.0.0-1018-azure
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8

pandas : 0.25.1
numpy : 1.17.2
pytz : 2019.3
dateutil : 2.8.0
pip : 19.2.3
setuptools : 41.4.0
Cython : None
pytest : 5.2.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.10.3
IPython : 7.8.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.1
numexpr : None
odfpy : None
openpyxl : 3.0.0
pandas_gbq : None
pyarrow : 0.15.0
pytables : None
s3fs : None
scipy : 1.3.1
sqlalchemy : 1.3.10
tables : None
xarray : None
xlrd : 1.2.0
xlwt : None
xlsxwriter : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions