Fix hash table ordering and null categories in categorical CSV parsing

chris-b1 · chris-b1 · commit 249094918be2 · 2016-08-04T18:09:18.000-05:00
diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
@@ -115,9 +115,9 @@ def teardown(self):
 
 
 class read_csv_categorical(object):
-    def setup(self):
-        goal_time = 0.2
+    goal_time = 0.2
 
+    def setup(self):
         N = 100000
         group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
         df = DataFrame({'a': np.random.choice(group1, N).astype('object'),
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -211,19 +211,49 @@ def test_categorical_dtype(self):
                                  'c': [3.4, 3.4, 4.5]})
         tm.assert_frame_equal(actual, expected)
 
+        actual = self.read_csv(StringIO(data), dtype={1: 'category'})
+        tm.assert_frame_equal(actual, expected)
+
+        # unsorted
+        data = """a,b,c
+1,b,3.4
+1,b,3.4
+2,a,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical.from_codes([0, 0, 1],
+                                                             ['b', 'a']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+        # missing
+        data = """a,b,c
+1,b,3.4
+1,nan,3.4
+2,a,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical.from_codes([0, -1, 1],
+                                                             ['b', 'a']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
     def test_categorical_dtype_encoding(self):
         # GH 10153
-        cases = [
-            ('unicode_series.csv', 'latin-1'),
-            ('utf16_ex.txt', 'utf-16')
-        ]
-
-        for f, encoding in cases:
-            pth = tm.get_data_path(f)
-            expected = self.read_csv(pth, header=None, encoding=encoding)
-            result = self.read_csv(pth, header=None, encoding=encoding, dtype='category')
-            result = result.apply(lambda x: x.astype(object))
-            tm.assert_frame_equal(actual, expected)
+        pth = tm.get_data_path('unicode_series.csv')
+        encoding = 'latin-1'
+        expected = self.read_csv(pth, header=None, encoding=encoding)
+        actual = self.read_csv(pth, header=None, encoding=encoding,
+                               dtype={1: 'category'})
+        actual[1] = actual[1].astype(object)
+        tm.assert_frame_equal(actual, expected)
+
+        pth = tm.get_data_path('utf16_ex.txt')
+        encoding = 'utf-16'
+        expected = self.read_table(pth, encoding=encoding)
+        actual = self.read_table(pth, encoding=encoding, dtype='category')
+        actual = actual.apply(lambda x: x.astype(object))
+        tm.assert_frame_equal(actual, expected)
 
     def test_pass_dtype_as_recarray(self):
         if compat.is_platform_windows() and self.low_memory:
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -1527,8 +1527,9 @@ cdef _categorical_convert(parser_t *parser, int col,
 
     lines = line_end - line_start
     codes = np.empty(lines, dtype=np.int64)
+
     # factorize parsed values, creating a hash table
-    # bytes -> category
+    # bytes -> category code
     with nogil:
         table = kh_init_str()
         coliter_setup(&it, parser, col, line_start)
@@ -1554,24 +1555,20 @@ cdef _categorical_convert(parser_t *parser, int col,
             codes[i] = table.vals[k]
 
     # parse and box categories to python strings
-    i = 0
     result = np.empty(table.n_occupied, dtype=np.object_)
     if path == ENCODED:
         for k in range(table.n_buckets):
             if kh_exist_str(table, k):
                 size = strlen(table.keys[k])
-                result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors)
-                i += 1
+                result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size, encoding, errors)
     elif path == UTF8:
         for k in range(table.n_buckets):
             if kh_exist_str(table, k):
-                result[i] = PyUnicode_FromString(table.keys[k])
-                i += 1
+                result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
     elif path == CSTRING:
         for k in range(table.n_buckets):
             if kh_exist_str(table, k):
-                result[i] = PyBytes_FromString(table.keys[k])
-                i += 1
+                result[table.vals[k]] = PyBytes_FromString(table.keys[k])
 
     kh_destroy_str(table)
     return np.asarray(codes), result, na_count
diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
@@ -850,6 +850,9 @@ def test_union_categorical(self):
             ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
             ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
 
+            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
+             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),
+
             (pd.date_range('2014-01-01', '2014-01-05'),
              pd.date_range('2014-01-06', '2014-01-07'),
              pd.date_range('2014-01-01', '2014-01-07')),
diff --git a/pandas/types/concat.py b/pandas/types/concat.py
@@ -240,6 +240,7 @@ def union_categoricals(to_union, sort_categories=False):
         Emmpty list of categoricals passed
     """
     from pandas import Index, Categorical
+    from pandas.core.algorithms import take_1d
 
     if len(to_union) == 0:
         raise ValueError('No Categoricals to union')