Closed
Description
Round-tripping a categorical Series through HDF (table format) appears to fail when its categories contain both a string with certain special (non-ASCII) characters AND the empty string:
# -*- coding: latin-1 -*-
import pandas
import os
# Categorical Series used as test inputs. The value lists mix non-ASCII
# strings, plain ASCII strings and the empty string in different combinations.
examples = [
    pandas.Series(values, dtype='category')
    for values in (
        ['EÉ, 17', '', 'a', 'b', 'c'],
        ['EÉ, 17', 'a', 'b', 'c'],
        ['', 'a', 'b', 'c'],
        ['EE, 17', '', 'a', 'b', 'c'],
        ['øü', 'a', 'b', 'c'],
        ['Aøü', '', 'a', 'b', 'c'],
        ['EÉ, 17', 'øü', 'a', 'b', 'c'],
    )
]
def test_hdf(s):
    """Write ``s`` to an HDF5 file in table format and read it back.

    The file is created inside a private temporary directory, so the call
    never deletes or overwrites a ``testhdf.h5`` in the current working
    directory and leaves nothing behind afterwards.

    Parameters
    ----------
    s : pandas.Series
        The object to round-trip; it is stored under the key ``'data'``.

    Returns
    -------
    The object returned by ``pandas.read_hdf`` for the key ``'data'``.

    Raises
    ------
    Exception
        Whatever ``to_hdf`` / ``read_hdf`` raise is propagated unchanged.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        f = os.path.join(tmpdir, 'testhdf.h5')
        # Pass the key by keyword: recent pandas deprecates positional
        # arguments to ``to_hdf`` other than the path.
        s.to_hdf(f, key='data', format='table')
        return pandas.read_hdf(f, key='data')
# Run every example through the HDF round trip and print one result line per
# example: index, pass/fail, the input values and the error (if any).
for i, s in enumerate(examples):
    try:
        test_hdf(s)
    except Exception as ex:
        # Deliberately broad: a failing example must be reported, not abort
        # the remaining examples.
        flag, e = False, ex
    else:
        flag, e = True, ''
    status = 'pass' if flag else 'fail'
    print('%d: %s\t%s\t%s' % (i, status, s.tolist(), e))
Results in:
0: fail ['EÉ, 17', '', 'a', 'b', 'c'] Categorical categories must be unique
1: pass ['EÉ, 17', 'a', 'b', 'c']
2: pass ['', 'a', 'b', 'c']
3: pass ['EE, 17', '', 'a', 'b', 'c']
4: pass ['øü', 'a', 'b', 'c']
5: fail ['Aøü', '', 'a', 'b', 'c'] Categorical categories must be unique
6: pass ['EÉ, 17', 'øü', 'a', 'b', 'c']
Not sure if I am using this incorrectly or if this is actually a corner case.