From e429a5063b465b228405ecd6e0c08af5d6ff3aaf Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Tue, 7 Sep 2021 11:35:52 +0200
Subject: [PATCH 1/4] Added _format_str: mapping NumPy to Arrow format strings

---
 protocol/pandas_implementation.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index a016d25c..e0ef2590 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -435,10 +435,32 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
             raise NotImplementedError(f"Data type {dtype} not handled yet")
 
         bitwidth = dtype.itemsize * 8
-        format_str = dtype.str
+        format_str = self._format_str(dtype.str)
         endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '='
         return (kind, bitwidth, format_str, endianness)
 
+    def _format_str(self, format_str) -> str:
+        """
+        Mapping of NumPy formt strings to
+        Apache Arrow C Data Interface format strings.
+        'O' categorical mapped as 'U': large utf-8 string for now
+        """
+        _ints = {8: 'c', 16: 's', 32: 'i', 64: 'l'}
+        _uints = {8: 'C', 16: 'S', 32: 'I', 64: 'L'}
+        _floats = {16: 'e', 32: 'f', 64: 'g'}
+        _np_dtypes = {'i': _ints, 'u': _uints, 'f': _floats, 'b': {8: 'b'}, 'O': {64: 'U'}}
+
+        dt = np.dtype(format_str)
+        if dt.byteorder == '>':
+            raise ValueError("Big-endian not supported by exchange "
+                             "protocol")
+
+        arrow_format_str = _np_dtypes.get(dt.kind, {}).get(dt.itemsize*8)
+
+        if arrow_format_str is None:
+            raise NotImplementedError(f"Format string {format_str} not handled yet")
+
+        return arrow_format_str
 
     @property
     def describe_categorical(self) -> Dict[str, Any]:

From 1dd21bbb2e0fb595459eb25feaa4a51088904bd9 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Tue, 7 Sep 2021 11:58:30 +0200
Subject: [PATCH 2/4] Added test for _format_str

---
 protocol/pandas_implementation.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index e0ef2590..a455bc51 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -902,6 +902,12 @@ def test_metadata():
     assert_dataframe_equal(df.__dataframe__(), df)
     tm.assert_frame_equal(df, df2)
 
+def test_fromat_str():
+    df = pd.DataFrame(data=dict(a=[1, 2, 3], B=[3, 4, 5],
+                                c=[1.5, 2.5, 3.5], D=["a", "b", "cdef"]))
+    df["B"] = df["B"].astype("category")
+    df["D"] = df["D"].astype("object")
+    df2 = from_dataframe(df)
 
 if __name__ == '__main__':
     test_categorical_dtype()

From 8f43391d4c81aeddc5be31561c70473abb9df4ca Mon Sep 17 00:00:00 2001
From: Athan
Date: Tue, 7 Sep 2021 10:15:00 -0700
Subject: [PATCH 3/4] Fix typo

---
 protocol/pandas_implementation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index a455bc51..bb73e5ca 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -441,7 +441,7 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
 
     def _format_str(self, format_str) -> str:
         """
-        Mapping of NumPy formt strings to
+        Mapping of NumPy format strings to
         Apache Arrow C Data Interface format strings.
         'O' categorical mapped as 'U': large utf-8 string for now
         """

From 82066208be8943fa4292d6bf7400d1dbb0c58744 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Wed, 8 Sep 2021 10:15:14 +0200
Subject: [PATCH 4/4] Fix incomplete test

---
 protocol/pandas_implementation.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index a455bc51..a4239326 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -907,7 +907,11 @@ def test_fromat_str():
                                 c=[1.5, 2.5, 3.5], D=["a", "b", "cdef"]))
     df["B"] = df["B"].astype("category")
     df["D"] = df["D"].astype("object")
-    df2 = from_dataframe(df)
+
+    format_strings = {'a': 'l', 'B': 'U', 'c': 'g', 'D': 'u'}
+    for col in df.columns.tolist():
+        column = _PandasColumn(df[col])
+        assert column.dtype[2] == format_strings[col]
 
 if __name__ == '__main__':
     test_categorical_dtype()