data-apis · rgommers · Jul 28, 2022 · Sep 29, 2021 · Jun 1, 2022 · Jul 7, 2022
diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
@@ -170,23 +170,24 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
         pass
 
     @property
-    def describe_categorical(self) -> dict[bool, bool, Optional[dict]]:
+    def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
         """
         If the dtype is categorical, there are two options:
 
         - There are only values in the data buffer.
-        - There is a separate dictionary-style encoding for categorical values.
+        - There is a separate non-categorical Column encoding categorical values.
 
         Raises RuntimeError if the dtype is not categorical
 
         Content of returned dict:
 
             - "is_ordered" : bool, whether the ordering of dictionary indices is
                              semantically meaningful.
-            - "is_dictionary" : bool, whether a dictionary-style mapping of
+            - "is_dictionary" : bool, whether a mapping of
                                 categorical values to other objects exists
-            - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
-                          None if not a dictionary-style categorical.
+            - "categories" : Column representing the (implicit) mapping of indices to
+                             category values (e.g. an array of cat1, cat2, ...).
+                             None if not a dictionary-style categorical.
 
         TBD: are there any other in-memory representations that are needed?
         """

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
@@ -152,7 +152,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
     # If you want to cheat for testing (can't use `_col` in real-world code):
     #    categories = col._col.values.categories.values
     #    codes = col._col.values.codes
-    categories = np.asarray(list(mapping.values()))
+    categories = convert_column_to_ndarray(mapping)
     codes_buffer, codes_dtype = col.get_buffers()["data"]
     codes = buffer_to_ndarray(codes_buffer, codes_dtype)
     values = categories[codes]
@@ -446,17 +446,17 @@ def describe_categorical(self) -> Dict[str, Any]:
         If the dtype is categorical, there are two options:
 
         - There are only values in the data buffer.
-        - There is a separate dictionary-style encoding for categorical values.
+        - The data buffer stores encoded values, while the (single)
+          child column stores the categorical values themselves.
 
         Raises RuntimeError if the dtype is not categorical
 
         Content of returned dict:
 
             - "is_ordered" : bool, whether the ordering of dictionary indices is
                              semantically meaningful.
-            - "is_dictionary" : bool, whether a dictionary-style mapping of
-                                categorical values to other objects exists
-            - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
+            - "is_dictionary" : bool, whether the data is integer encoded
+            - "mapping" : Column representing the mapping of indices to category values.
                           None if not a dictionary-style categorical.
         """
         if not self.dtype[0] == _DtypeKind.CATEGORICAL:
@@ -465,13 +465,10 @@ def describe_categorical(self) -> Dict[str, Any]:
 
         ordered = self._col.dtype.ordered
         is_dictionary = True
-        # NOTE: this shows the children approach is better, transforming
-        # `categories` to a "mapping" dict is inefficient
-        codes = self._col.values.codes  # ndarray, length `self.size`
-        # categories.values is ndarray of length n_categories
-        categories = self._col.values.categories.values
-        mapping = {ix: val for ix, val in enumerate(categories)}
-        return ordered, is_dictionary, mapping
+        categories = _PandasColumn(self._col.dtype.categories.to_series())
+        return {"is_ordered": ordered,
+                "is_dictionary": is_dictionary,
+                "mapping": categories}
 
     @property
     def describe_null(self) -> Tuple[int, Any]:
@@ -840,7 +837,8 @@ def test_categorical_dtype():
     assert col.null_count == 1
     assert col.describe_null == (2, -1)  # sentinel value -1
     assert col.num_chunks() == 1
-    assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
+    assert col.describe_categorical["is_ordered"] == False
+    assert col.describe_categorical["is_dictionary"] == True
 
     df2 = from_dataframe(df)
     assert_dataframe_equal(df.__dataframe__(), df)