Skip to content

df.reset_index() fails if df.index.name is a tuple #16164

Closed
@toobaz

Description

@toobaz

Code Sample, a copy-pastable example if possible

In [2]: df = pd.DataFrame([[1, 2]], columns=pd.MultiIndex.from_product([['A'], ['a', 'b']]))

In [3]: reind = df.set_index([('A', 'a')])

In [4]: reind.reset_index()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    265         try:
--> 266             vals = _hash.hash_object_array(vals, hash_key, encoding)
    267         except TypeError:

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.pyx in pandas.util.libhashing.hash_object_array (pandas/util/hashing.c:2372)()

TypeError: ('A', 'a') of type <class 'tuple'> is not a valid type for hashing, must be string or null

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-4-985862d3b27d> in <module>()
----> 1 reind.reset_index()

/home/pietro/nobackup/repo/pandas/pandas/core/frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)
   3057                     name = tuple(name_lst)
   3058             values = _maybe_casted_values(self.index)
-> 3059             new_obj.insert(0, name, values)
   3060 
   3061         new_obj.index = new_index

/home/pietro/nobackup/repo/pandas/pandas/core/frame.py in insert(self, loc, column, value, allow_duplicates)
   2518         value = self._sanitize_column(column, value, broadcast=False)
   2519         self._data.insert(loc, column, value,
-> 2520                           allow_duplicates=allow_duplicates)
   2521 
   2522     def assign(self, **kwargs):

/home/pietro/nobackup/repo/pandas/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
   3806 
   3807         """
-> 3808         if not allow_duplicates and item in self.items:
   3809             # Should this be a different kind of error??
   3810             raise ValueError('cannot insert {}, already exists'.format(item))

/home/pietro/nobackup/repo/pandas/pandas/core/indexes/multi.py in __contains__(self, key)
   1326         hash(key)
   1327         try:
-> 1328             self.get_loc(key)
   1329             return True
   1330         except LookupError:

/home/pietro/nobackup/repo/pandas/pandas/core/indexes/multi.py in get_loc(self, key, method)
   1984             key = _values_from_object(key)
   1985             key = tuple(map(_maybe_str_to_time_stamp, key, self.levels))
-> 1986             return self._engine.get_loc(key)
   1987 
   1988         # -- partial selection or non-unique index

/home/pietro/nobackup/repo/pandas/pandas/_libs/index.pyx in pandas._libs.index.MultiIndexEngine.get_loc (pandas/_libs/index.c:13171)()

/home/pietro/nobackup/repo/pandas/pandas/_libs/index.pyx in pandas._libs.index.MultiIndexEngine.get_loc (pandas/_libs/index.c:13018)()

/home/pietro/nobackup/repo/pandas/pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.MultiIndexHashTable.get_item (pandas/_libs/hashtable.c:23625)()

/home/pietro/nobackup/repo/pandas/pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.MultiIndexHashTable.get_item (pandas/_libs/hashtable.c:23374)()

/home/pietro/nobackup/repo/pandas/pandas/core/indexes/multi.py in _hashed_indexing_key(self, key)
    755         key = tuple([f(k, stringify)
    756                      for k, stringify in zip(key, self._have_mixed_levels)])
--> 757         return hash_tuples(key)
    758 
    759     @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in hash_tuples(vals, encoding, hash_key)
    159                                 hash_key=hash_key)
    160               for cat in vals)
--> 161     h = _combine_hash_arrays(hashes, len(vals))
    162     if is_tuple:
    163         h = h[0]

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in _combine_hash_arrays(arrays, num_items)
     31     """
     32     try:
---> 33         first = next(arrays)
     34     except StopIteration:
     35         return np.array([], dtype=np.uint64)

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in <genexpr>(.0)
    158                                 encoding=encoding,
    159                                 hash_key=hash_key)
--> 160               for cat in vals)
    161     h = _combine_hash_arrays(hashes, len(vals))
    162     if is_tuple:

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in _hash_categorical(c, encoding, hash_key)
    182     """
    183     hashed = hash_array(c.categories.values, encoding, hash_key,
--> 184                         categorize=False)
    185 
    186     # we have uint64, as we don't directly support missing values

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    267         except TypeError:
    268             # we have mixed types
--> 269             vals = _hash.hash_object_array(vals.astype(str).astype(object),
    270                                            hash_key, encoding)
    271 

ValueError: setting an array element with a sequence

Problem description

In general, one expects df.set_index([df.columns[0]]).reset_index() to return (a copy of) df. However, when df.columns is a MultiIndex, the name of the resulting index is a tuple, and reset_index() raises the ValueError shown above instead.

@jreback I know you were not really convinced this should work, but I'm uploading a PR in a couple of minutes, and as you will see the fix is extremely simple.

Expected Output

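df (that is, reind.reset_index() should return a DataFrame equal to the original df from the code sample)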

Output of pd.show_versions()

INSTALLED VERSIONS

commit: None
python: 3.5.3.final.0
python-bits: 64
OS: Linux
OS-release: 4.7.0-1-amd64
machine: x86_64
processor:
byteorder: little
LC_ALL: None
LANG: it_IT.utf8
LOCALE: it_IT.UTF-8

pandas: 0.20.0rc1+29.g075eca1fa
pytest: 3.0.6
pip: 9.0.1
setuptools: 33.1.1
Cython: 0.25.2
numpy: 1.12.0
scipy: 0.18.1
xarray: 0.9.1
IPython: 5.1.0.dev
sphinx: 1.4.9
patsy: 0.3.0-dev
dateutil: 2.5.3
pytz: 2016.7
blosc: None
bottleneck: 1.2.0
tables: 3.3.0
numexpr: 2.6.1
feather: 0.3.1
matplotlib: 2.0.0
openpyxl: 2.3.0
xlrd: 1.0.0
xlwt: 1.1.2
xlsxwriter: 0.9.6
lxml: 3.7.1
bs4: 4.5.3
html5lib: 0.999999999
sqlalchemy: 1.0.15
pymysql: None
psycopg2: None
jinja2: 2.8
s3fs: None
pandas_gbq: None
pandas_datareader: 0.2.1

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions