From d31a41320d51d5285679c0532f39d2151f259486 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 21 Apr 2023 09:44:44 +0100 Subject: [PATCH 1/9] support bitmasks in interchange --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/interchange/from_dataframe.py | 20 ++++++++++++++------ pandas/tests/interchange/test_impl.py | 15 +++++++++++++++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 92731426bed03..e42d2de75f478 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -28,6 +28,7 @@ Bug fixes - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`) +- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 998f3bc374942..3f9cf998fc97c 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -6,6 +6,8 @@ import numpy as np +from pandas.compat._optional import import_optional_dependency + import pandas as pd from pandas.core.interchange.dataframe_protocol import ( Buffer, @@ -23,7 +25,7 @@ DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, - DtypeKind.BOOL: {8: bool}, + DtypeKind.BOOL: {1: bool, 8: bool}, } @@ -406,15 +408,21 @@ def buffer_to_ndarray( # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports # it since https://github.com/numpy/numpy/pull/19083 ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) - data_pointer = ctypes.cast( - buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) - ) if bit_width == 1: assert length is not None, "`length` must be specified for a bit-mask buffer." - arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) - return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8) + pa = import_optional_dependency("pyarrow") + arr = pa.BooleanArray.from_buffers( + pa.bool_(), + length, + [None, pa.foreign_buffer(buffer.ptr, length)], + offset=offset, + ) + return np.asarray(arr) else: + data_pointer = ctypes.cast( + buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) + ) return np.ctypeslib.as_array( data_pointer, shape=(buffer.bufsize // (bit_width // 8),) ) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a9835b8641e7d..1a640b81cc788 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -104,6 +104,21 @@ def test_large_string_pyarrow(): assert pa.Table.equals(pa.interchange.from_dataframe(result), table) +def test_bitmasks_pyarrow(): + # GH 52795 + pa = pytest.importorskip("pyarrow", "11.0.0") + + arr = [3.3, None, 2.1] + table = pa.table({"arr": arr}) + exchange_df = table.__dataframe__() + result = from_dataframe(exchange_df) + expected = pd.DataFrame({"arr": [3.3, float("nan"), 2.1]}) + tm.assert_frame_equal(result, expected) + + # check round-trip + assert pa.Table.equals(pa.interchange.from_dataframe(result), table) + + @pytest.mark.parametrize( "data", [int_data, uint_data, float_data, bool_data, datetime_data] ) From b8e350ab0628619830d5c5bdee4a7cb66345746d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 21 Apr 2023 09:46:25 +0100 Subject: [PATCH 2/9] remove dead code --- pandas/core/interchange/from_dataframe.py | 54 ----------------------- 1 file changed, 54 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 3f9cf998fc97c..dccb8247ca195 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -428,60 +428,6 @@ def buffer_to_ndarray( ) -def bitmask_to_bool_ndarray( - bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 -) -> np.ndarray: - """ - Convert bit-mask to a boolean NumPy array. - - Parameters - ---------- - bitmask : np.ndarray[uint8] - NumPy array of uint8 dtype representing the bitmask. - mask_length : int - Number of elements in the mask to interpret. - first_byte_offset : int, default: 0 - Number of elements to offset from the start of the first byte. - - Returns - ------- - np.ndarray[bool] - """ - bytes_to_skip = first_byte_offset // 8 - bitmask = bitmask[bytes_to_skip:] - first_byte_offset %= 8 - - bool_mask = np.zeros(mask_length, dtype=bool) - - # Processing the first byte separately as it has its own offset - val = bitmask[0] - mask_idx = 0 - bits_in_first_byte = min(8 - first_byte_offset, mask_length) - for j in range(bits_in_first_byte): - if val & (1 << (j + first_byte_offset)): - bool_mask[mask_idx] = True - mask_idx += 1 - - # `mask_length // 8` describes how many full bytes to process - for i in range((mask_length - bits_in_first_byte) // 8): - # doing `+ 1` as we already processed the first byte - val = bitmask[i + 1] - for j in range(8): - if val & (1 << j): - bool_mask[mask_idx] = True - mask_idx += 1 - - if len(bitmask) > 1: - # Processing reminder of last byte - val = bitmask[-1] - for j in range(len(bool_mask) - mask_idx): - if val & (1 << j): - bool_mask[mask_idx] = True - mask_idx += 1 - - return bool_mask - - def set_nulls( data: np.ndarray | pd.Series, col: Column, From 010b521e1e6da9f4a9d433fa00b2c5299158bce0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 21 Apr 2023 10:53:31 +0100 Subject: [PATCH 3/9] fixup for slice, add tests --- pandas/core/interchange/from_dataframe.py | 5 +++-- pandas/tests/interchange/test_impl.py | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index dccb8247ca195..9c5f7dc8a53cb 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -254,7 +254,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: Endianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data - data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size()) + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) # Retrieve the offsets buffer containing the index offsets demarcating # the beginning and the ending of each string @@ -424,7 +424,8 @@ def buffer_to_ndarray( buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) ) return np.ctypeslib.as_array( - data_pointer, shape=(buffer.bufsize // (bit_width // 8),) + data_pointer, + shape=(length,), ) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 1a640b81cc788..d393ba6fd3957 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -104,15 +104,26 @@ def test_large_string_pyarrow(): assert pa.Table.equals(pa.interchange.from_dataframe(result), table) -def test_bitmasks_pyarrow(): +@pytest.mark.parametrize( + ("offset", "length", "expected_values"), + [ + (0, None, [3.3, float("nan"), 2.1]), + (1, None, [float("nan"), 2.1]), + (2, None, [2.1]), + (0, 2, [3.3, float("nan")]), + (0, 1, [3.3]), + (1, 1, [float("nan")]), + ], +) +def test_bitmasks_pyarrow(offset, length, expected_values): # GH 52795 pa = pytest.importorskip("pyarrow", "11.0.0") arr = [3.3, None, 2.1] - table = pa.table({"arr": arr}) + table = pa.table({"arr": arr}).slice(offset, length) exchange_df = table.__dataframe__() result = from_dataframe(exchange_df) - expected = pd.DataFrame({"arr": [3.3, float("nan"), 2.1]}) + expected = pd.DataFrame({"arr": expected_values}) tm.assert_frame_equal(result, expected) # check round-trip From 4eb53f2ab008e7a2486f702dab2bfbea66bdcf3a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 21 Apr 2023 12:25:37 +0100 Subject: [PATCH 4/9] tighten typing --- pandas/core/interchange/from_dataframe.py | 25 +++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 9c5f7dc8a53cb..45d6bdd7917c1 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -156,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: buffers = col.get_buffers() data_buff, data_dtype = buffers["data"] - data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size()) + data = buffer_to_ndarray( + data_buff, data_dtype, offset=col.offset, length=col.size() + ) data = set_nulls(data, col, buffers["validity"]) return data, buffers @@ -194,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] - codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size()) + codes = buffer_to_ndarray( + codes_buff, codes_dtype, offset=col.offset, length=col.size() + ) # Doing module in order to not get ``IndexError`` for # out-of-bounds sentinel values in `codes` @@ -263,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # meaning that it has more elements than in the data buffer, do `col.size() + 1` # here to pass a proper offsets buffer size offsets = buffer_to_ndarray( - offset_buff, offset_dtype, col.offset, length=col.size() + 1 + offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1 ) null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): assert buffers["validity"], "Validity buffers cannot be empty for masks" valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) if sentinel_val == 0: null_pos = ~null_pos @@ -358,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: getattr(ArrowCTypes, f"UINT{dtype[1]}"), Endianness.NATIVE, ), - col.offset, - col.size(), + offset=col.offset, + length=col.size(), ) data = parse_datetime_format_str(format_str, data) @@ -370,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: def buffer_to_ndarray( buffer: Buffer, dtype: tuple[DtypeKind, int, str, str], + *, + length: int, offset: int = 0, - length: int | None = None, ) -> np.ndarray: """ Build a NumPy array from the passed buffer. @@ -464,7 +471,9 @@ def set_nulls( elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): assert validity, "Expected to have a validity buffer for the mask" valid_buff, valid_dtype = validity - null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) if sentinel_val == 0: null_pos = ~null_pos elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): From 64d93439dee7e3c0e4c47e7554ff711d8d194310 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 21 Apr 2023 13:23:37 +0100 Subject: [PATCH 5/9] reduce diff --- pandas/core/interchange/from_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 45d6bdd7917c1..eef897397b058 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -258,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: Endianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data - data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size()) # Retrieve the offsets buffer containing the index offsets demarcating # the beginning and the ending of each string From a891ad8728efce45f666aec27bbf68b8cbdb2c18 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 21 Apr 2023 14:03:39 +0100 Subject: [PATCH 6/9] post-merge fixup --- pandas/core/interchange/from_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index eef897397b058..45d6bdd7917c1 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -258,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: Endianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data - data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size()) + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) # Retrieve the offsets buffer containing the index offsets demarcating # the beginning and the ending of each string From 0dfeb85720eb272de2b6d511f0a412db1950e99d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 22 Apr 2023 09:42:42 +0100 Subject: [PATCH 7/9] add new whatsnew note --- doc/source/whatsnew/v2.0.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 915beb7a3130a..57b52ec30c34e 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -28,6 +28,7 @@ Bug fixes - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`) +- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`) From ec72f377deef4851989be98528173b232e38b5b1 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 24 Apr 2023 20:04:13 +0100 Subject: [PATCH 8/9] move to 2.0.2 --- doc/source/whatsnew/v2.0.2.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 0a6738cb9b3dc..09932a2d2d571 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -20,6 +20,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) +- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - .. --------------------------------------------------------------------------- From d8adf1f6f09d65bc500e39a022686435406cab88 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 24 Apr 2023 20:04:44 +0100 Subject: [PATCH 9/9] revert --- doc/source/whatsnew/v2.0.1.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 386294ad8a72a..2613d12e43400 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -32,8 +32,6 @@ Bug fixes - Bug in :class:`~arrays.ArrowExtensionArray` with duration dtype overflowing when constructed from data containing numpy ``NaT`` (:issue:`52843`) - Bug in :func:`Series.dt.round` when passing a ``freq`` of equal or higher resolution compared to the :class:`Series` would raise a ``ZeroDivisionError`` (:issue:`52761`) - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`) -- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) -- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)