diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index a016d25c..16fdf432 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -435,10 +435,37 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
             raise NotImplementedError(f"Data type {dtype} not handled yet")
 
         bitwidth = dtype.itemsize * 8
-        format_str = dtype.str
+        format_str = self._format_str(dtype.str)
         endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '='
         return (kind, bitwidth, format_str, endianness)
 
+    def _format_str(self, format_str) -> str:
+        """
+        Map a NumPy format string to the matching
+        Apache Arrow C Data Interface format string.
+
+        The lookup is keyed on the NumPy dtype kind and its width in bits.
+        'O' categorical mapped as 'U': large utf-8 string for now.
+
+        Raises ValueError for big-endian dtypes and NotImplementedError
+        for any kind/width combination that has no mapping yet.
+        """
+        _ints = {8: 'c', 16: 's', 32: 'i', 64: 'l'}
+        _uints = {8: 'C', 16: 'S', 32: 'I', 64: 'L'}
+        _floats = {16: 'e', 32: 'f', 64: 'g'}
+        _np_dtypes = {'i': _ints, 'u': _uints, 'f': _floats,
+                      'b': {8: 'b'}, 'O': {64: 'U'}}
+
+        dt = np.dtype(format_str)
+        if dt.byteorder == '>':
+            raise ValueError("Big-endian not supported by exchange protocol")
+
+        arrow_format_str = _np_dtypes.get(dt.kind, {}).get(dt.itemsize * 8)
+
+        if arrow_format_str is None:
+            raise NotImplementedError(f"Format string {format_str} not handled yet")
+
+        return arrow_format_str
 
     @property
     def describe_categorical(self) -> Dict[str, Any]:
@@ -880,6 +907,17 @@ def test_metadata():
     assert_dataframe_equal(df.__dataframe__(), df)
     tm.assert_frame_equal(df, df2)
 
+def test_format_str():
+    """Each column dtype must report the expected Arrow format string."""
+    df = pd.DataFrame(data=dict(a=[1, 2, 3], B=[3, 4, 5],
+                                c=[1.5, 2.5, 3.5], D=["a", "b", "cdef"]))
+    df["B"] = df["B"].astype("category")
+    df["D"] = df["D"].astype("object")
+
+    format_strings = {'a': 'l', 'B': 'U', 'c': 'g', 'D': 'u'}
+    for col in df.columns.tolist():
+        column = _PandasColumn(df[col])
+        assert column.dtype[2] == format_strings[col]
 
 if __name__ == '__main__':
     test_categorical_dtype()