Skip to content

Commit 2490949

Browse files
committed
fix hash table ordering, null categories
1 parent 4e0722d commit 2490949

File tree

5 files changed

+52
-21
lines changed

5 files changed

+52
-21
lines changed

asv_bench/benchmarks/parser_vb.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,9 @@ def teardown(self):
115115

116116

117117
class read_csv_categorical(object):
118-
def setup(self):
119-
goal_time = 0.2
118+
goal_time = 0.2
120119

120+
def setup(self):
121121
N = 100000
122122
group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
123123
df = DataFrame({'a': np.random.choice(group1, N).astype('object'),

pandas/io/tests/parser/c_parser_only.py

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -211,19 +211,49 @@ def test_categorical_dtype(self):
211211
'c': [3.4, 3.4, 4.5]})
212212
tm.assert_frame_equal(actual, expected)
213213

214+
actual = self.read_csv(StringIO(data), dtype={1: 'category'})
215+
tm.assert_frame_equal(actual, expected)
216+
217+
# unsorted
218+
data = """a,b,c
219+
1,b,3.4
220+
1,b,3.4
221+
2,a,4.5"""
222+
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
223+
'b': Categorical.from_codes([0, 0, 1],
224+
['b', 'a']),
225+
'c': Categorical(['3.4', '3.4', '4.5'])})
226+
actual = self.read_csv(StringIO(data), dtype='category')
227+
tm.assert_frame_equal(actual, expected)
228+
229+
# missing
230+
data = """a,b,c
231+
1,b,3.4
232+
1,nan,3.4
233+
2,a,4.5"""
234+
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
235+
'b': Categorical.from_codes([0, -1, 1],
236+
['b', 'a']),
237+
'c': Categorical(['3.4', '3.4', '4.5'])})
238+
actual = self.read_csv(StringIO(data), dtype='category')
239+
tm.assert_frame_equal(actual, expected)
240+
214241
def test_categorical_dtype_encoding(self):
215242
# GH 10153
216-
cases = [
217-
('unicode_series.csv', 'latin-1'),
218-
('utf16_ex.txt', 'utf-16')
219-
]
220-
221-
for f, encoding in cases:
222-
pth = tm.get_data_path(f)
223-
expected = self.read_csv(pth, header=None, encoding=encoding)
224-
result = self.read_csv(pth, header=None, encoding=encoding, dtype='category')
225-
result = result.apply(lambda x: x.astype(object))
226-
tm.assert_frame_equal(actual, expected)
243+
pth = tm.get_data_path('unicode_series.csv')
244+
encoding = 'latin-1'
245+
expected = self.read_csv(pth, header=None, encoding=encoding)
246+
actual = self.read_csv(pth, header=None, encoding=encoding,
247+
dtype={1: 'category'})
248+
actual[1] = actual[1].astype(object)
249+
tm.assert_frame_equal(actual, expected)
250+
251+
pth = tm.get_data_path('utf16_ex.txt')
252+
encoding = 'utf-16'
253+
expected = self.read_table(pth, encoding=encoding)
254+
actual = self.read_table(pth, encoding=encoding, dtype='category')
255+
actual = actual.apply(lambda x: x.astype(object))
256+
tm.assert_frame_equal(actual, expected)
227257

228258
def test_pass_dtype_as_recarray(self):
229259
if compat.is_platform_windows() and self.low_memory:

pandas/parser.pyx

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1527,8 +1527,9 @@ cdef _categorical_convert(parser_t *parser, int col,
15271527

15281528
lines = line_end - line_start
15291529
codes = np.empty(lines, dtype=np.int64)
1530+
15301531
# factorize parsed values, creating a hash table
1531-
# bytes -> category
1532+
# bytes -> category code
15321533
with nogil:
15331534
table = kh_init_str()
15341535
coliter_setup(&it, parser, col, line_start)
@@ -1554,24 +1555,20 @@ cdef _categorical_convert(parser_t *parser, int col,
15541555
codes[i] = table.vals[k]
15551556

15561557
# parse and box categories to python strings
1557-
i = 0
15581558
result = np.empty(table.n_occupied, dtype=np.object_)
15591559
if path == ENCODED:
15601560
for k in range(table.n_buckets):
15611561
if kh_exist_str(table, k):
15621562
size = strlen(table.keys[k])
1563-
result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors)
1564-
i += 1
1563+
result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size, encoding, errors)
15651564
elif path == UTF8:
15661565
for k in range(table.n_buckets):
15671566
if kh_exist_str(table, k):
1568-
result[i] = PyUnicode_FromString(table.keys[k])
1569-
i += 1
1567+
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
15701568
elif path == CSTRING:
15711569
for k in range(table.n_buckets):
15721570
if kh_exist_str(table, k):
1573-
result[i] = PyBytes_FromString(table.keys[k])
1574-
i += 1
1571+
result[table.vals[k]] = PyBytes_FromString(table.keys[k])
15751572

15761573
kh_destroy_str(table)
15771574
return np.asarray(codes), result, na_count

pandas/tools/tests/test_concat.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,9 @@ def test_union_categorical(self):
850850
([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
851851
([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
852852

853+
(['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
854+
['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),
855+
853856
(pd.date_range('2014-01-01', '2014-01-05'),
854857
pd.date_range('2014-01-06', '2014-01-07'),
855858
pd.date_range('2014-01-01', '2014-01-07')),

pandas/types/concat.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ def union_categoricals(to_union, sort_categories=False):
240240
Empty list of categoricals passed
241241
"""
242242
from pandas import Index, Categorical
243+
from pandas.core.algorithms import take_1d
243244

244245
if len(to_union) == 0:
245246
raise ValueError('No Categoricals to union')

0 commit comments

Comments
 (0)