From d31a41320d51d5285679c0532f39d2151f259486 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Fri, 21 Apr 2023 09:44:44 +0100
Subject: [PATCH 1/9] support bitmasks in interchange

---
 doc/source/whatsnew/v2.0.1.rst            |  1 +
 pandas/core/interchange/from_dataframe.py | 20 ++++++++++++++------
 pandas/tests/interchange/test_impl.py     | 15 +++++++++++++++
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst
index 92731426bed03..e42d2de75f478 100644
--- a/doc/source/whatsnew/v2.0.1.rst
+++ b/doc/source/whatsnew/v2.0.1.rst
@@ -28,6 +28,7 @@ Bug fixes
 - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
 - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
 - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)
 - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 998f3bc374942..3f9cf998fc97c 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -6,6 +6,8 @@
 
 import numpy as np
 
+from pandas.compat._optional import import_optional_dependency
+
 import pandas as pd
 from pandas.core.interchange.dataframe_protocol import (
     Buffer,
@@ -23,7 +25,7 @@
     DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
     DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
     DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
-    DtypeKind.BOOL: {8: bool},
+    DtypeKind.BOOL: {1: bool, 8: bool},
 }
 
 
@@ -406,15 +408,21 @@ def buffer_to_ndarray(
     # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
     # it since https://github.com/numpy/numpy/pull/19083
     ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
-    data_pointer = ctypes.cast(
-        buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
-    )
 
     if bit_width == 1:
         assert length is not None, "`length` must be specified for a bit-mask buffer."
-        arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
-        return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
+        pa = import_optional_dependency("pyarrow")
+        arr = pa.BooleanArray.from_buffers(
+            pa.bool_(),
+            length,
+            [None, pa.foreign_buffer(buffer.ptr, length)],
+            offset=offset,
+        )
+        return np.asarray(arr)
     else:
+        data_pointer = ctypes.cast(
+            buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
+        )
         return np.ctypeslib.as_array(
             data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
         )
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index a9835b8641e7d..1a640b81cc788 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -104,6 +104,21 @@ def test_large_string_pyarrow():
     assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
 
 
+def test_bitmasks_pyarrow():
+    # GH 52795
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = [3.3, None, 2.1]
+    table = pa.table({"arr": arr})
+    exchange_df = table.__dataframe__()
+    result = from_dataframe(exchange_df)
+    expected = pd.DataFrame({"arr": [3.3, float("nan"), 2.1]})
+    tm.assert_frame_equal(result, expected)
+
+    # check round-trip
+    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+
+
 @pytest.mark.parametrize(
     "data", [int_data, uint_data, float_data, bool_data, datetime_data]
 )

From b8e350ab0628619830d5c5bdee4a7cb66345746d Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Fri, 21 Apr 2023 09:46:25 +0100
Subject: [PATCH 2/9] remove dead code

---
 pandas/core/interchange/from_dataframe.py | 54 -----------------------
 1 file changed, 54 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 3f9cf998fc97c..dccb8247ca195 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -428,60 +428,6 @@ def buffer_to_ndarray(
         )
 
 
-def bitmask_to_bool_ndarray(
-    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
-) -> np.ndarray:
-    """
-    Convert bit-mask to a boolean NumPy array.
-
-    Parameters
-    ----------
-    bitmask : np.ndarray[uint8]
-        NumPy array of uint8 dtype representing the bitmask.
-    mask_length : int
-        Number of elements in the mask to interpret.
-    first_byte_offset : int, default: 0
-        Number of elements to offset from the start of the first byte.
-
-    Returns
-    -------
-    np.ndarray[bool]
-    """
-    bytes_to_skip = first_byte_offset // 8
-    bitmask = bitmask[bytes_to_skip:]
-    first_byte_offset %= 8
-
-    bool_mask = np.zeros(mask_length, dtype=bool)
-
-    # Processing the first byte separately as it has its own offset
-    val = bitmask[0]
-    mask_idx = 0
-    bits_in_first_byte = min(8 - first_byte_offset, mask_length)
-    for j in range(bits_in_first_byte):
-        if val & (1 << (j + first_byte_offset)):
-            bool_mask[mask_idx] = True
-        mask_idx += 1
-
-    # `mask_length // 8` describes how many full bytes to process
-    for i in range((mask_length - bits_in_first_byte) // 8):
-        # doing `+ 1` as we already processed the first byte
-        val = bitmask[i + 1]
-        for j in range(8):
-            if val & (1 << j):
-                bool_mask[mask_idx] = True
-            mask_idx += 1
-
-    if len(bitmask) > 1:
-        # Processing reminder of last byte
-        val = bitmask[-1]
-        for j in range(len(bool_mask) - mask_idx):
-            if val & (1 << j):
-                bool_mask[mask_idx] = True
-            mask_idx += 1
-
-    return bool_mask
-
-
 def set_nulls(
     data: np.ndarray | pd.Series,
     col: Column,

From 010b521e1e6da9f4a9d433fa00b2c5299158bce0 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Fri, 21 Apr 2023 10:53:31 +0100
Subject: [PATCH 3/9] fixup for slice, add tests

---
 pandas/core/interchange/from_dataframe.py |  5 +++--
 pandas/tests/interchange/test_impl.py     | 17 ++++++++++++++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index dccb8247ca195..9c5f7dc8a53cb 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -254,7 +254,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         Endianness.NATIVE,
     )
     # Specify zero offset as we don't want to chunk the string data
-    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
+    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
 
     # Retrieve the offsets buffer containing the index offsets demarcating
     # the beginning and the ending of each string
@@ -424,7 +424,8 @@ def buffer_to_ndarray(
             buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
         )
         return np.ctypeslib.as_array(
-            data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
+            data_pointer,
+            shape=(length,),
         )
 
 
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 1a640b81cc788..d393ba6fd3957 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -104,15 +104,26 @@ def test_large_string_pyarrow():
     assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
 
 
-def test_bitmasks_pyarrow():
+@pytest.mark.parametrize(
+    ("offset", "length", "expected_values"),
+    [
+        (0, None, [3.3, float("nan"), 2.1]),
+        (1, None, [float("nan"), 2.1]),
+        (2, None, [2.1]),
+        (0, 2, [3.3, float("nan")]),
+        (0, 1, [3.3]),
+        (1, 1, [float("nan")]),
+    ],
+)
+def test_bitmasks_pyarrow(offset, length, expected_values):
     # GH 52795
     pa = pytest.importorskip("pyarrow", "11.0.0")
 
     arr = [3.3, None, 2.1]
-    table = pa.table({"arr": arr})
+    table = pa.table({"arr": arr}).slice(offset, length)
     exchange_df = table.__dataframe__()
     result = from_dataframe(exchange_df)
-    expected = pd.DataFrame({"arr": [3.3, float("nan"), 2.1]})
+    expected = pd.DataFrame({"arr": expected_values})
     tm.assert_frame_equal(result, expected)
 
     # check round-trip

From 4eb53f2ab008e7a2486f702dab2bfbea66bdcf3a Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Fri, 21 Apr 2023 12:25:37 +0100
Subject: [PATCH 4/9] tighten typing

---
 pandas/core/interchange/from_dataframe.py | 25 +++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 9c5f7dc8a53cb..45d6bdd7917c1 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -156,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     buffers = col.get_buffers()
 
     data_buff, data_dtype = buffers["data"]
-    data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size())
+    data = buffer_to_ndarray(
+        data_buff, data_dtype, offset=col.offset, length=col.size()
+    )
 
     data = set_nulls(data, col, buffers["validity"])
     return data, buffers
@@ -194,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
     buffers = col.get_buffers()
 
     codes_buff, codes_dtype = buffers["data"]
-    codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size())
+    codes = buffer_to_ndarray(
+        codes_buff, codes_dtype, offset=col.offset, length=col.size()
+    )
 
     # Doing module in order to not get ``IndexError`` for
     # out-of-bounds sentinel values in `codes`
@@ -263,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     # meaning that it has more elements than in the data buffer, do `col.size() + 1`
     # here to pass a proper offsets buffer size
     offsets = buffer_to_ndarray(
-        offset_buff, offset_dtype, col.offset, length=col.size() + 1
+        offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
     )
 
     null_pos = None
     if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
         assert buffers["validity"], "Validity buffers cannot be empty for masks"
         valid_buff, valid_dtype = buffers["validity"]
-        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
+        null_pos = buffer_to_ndarray(
+            valid_buff, valid_dtype, offset=col.offset, length=col.size()
+        )
         if sentinel_val == 0:
             null_pos = ~null_pos
 
@@ -358,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
             getattr(ArrowCTypes, f"UINT{dtype[1]}"),
             Endianness.NATIVE,
         ),
-        col.offset,
-        col.size(),
+        offset=col.offset,
+        length=col.size(),
     )
 
     data = parse_datetime_format_str(format_str, data)
@@ -370,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
 def buffer_to_ndarray(
     buffer: Buffer,
     dtype: tuple[DtypeKind, int, str, str],
+    *,
+    length: int,
     offset: int = 0,
-    length: int | None = None,
 ) -> np.ndarray:
     """
     Build a NumPy array from the passed buffer.
@@ -464,7 +471,9 @@ def set_nulls(
     elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
         assert validity, "Expected to have a validity buffer for the mask"
         valid_buff, valid_dtype = validity
-        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
+        null_pos = buffer_to_ndarray(
+            valid_buff, valid_dtype, offset=col.offset, length=col.size()
+        )
         if sentinel_val == 0:
             null_pos = ~null_pos
     elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):

From 64d93439dee7e3c0e4c47e7554ff711d8d194310 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Fri, 21 Apr 2023 13:23:37 +0100
Subject: [PATCH 5/9] reduce diff

---
 pandas/core/interchange/from_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 45d6bdd7917c1..eef897397b058 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -258,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         Endianness.NATIVE,
     )
     # Specify zero offset as we don't want to chunk the string data
-    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
+    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
 
     # Retrieve the offsets buffer containing the index offsets demarcating
     # the beginning and the ending of each string

From a891ad8728efce45f666aec27bbf68b8cbdb2c18 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Fri, 21 Apr 2023 14:03:39 +0100
Subject: [PATCH 6/9] post-merge fixup

---
 pandas/core/interchange/from_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index eef897397b058..45d6bdd7917c1 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -258,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         Endianness.NATIVE,
     )
     # Specify zero offset as we don't want to chunk the string data
-    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
+    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
 
     # Retrieve the offsets buffer containing the index offsets demarcating
     # the beginning and the ending of each string

From 0dfeb85720eb272de2b6d511f0a412db1950e99d Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Sat, 22 Apr 2023 09:42:42 +0100
Subject: [PATCH 7/9] add new whatsnew note

---
 doc/source/whatsnew/v2.0.1.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst
index 915beb7a3130a..57b52ec30c34e 100644
--- a/doc/source/whatsnew/v2.0.1.rst
+++ b/doc/source/whatsnew/v2.0.1.rst
@@ -28,6 +28,7 @@ Bug fixes
 - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
 - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
 - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
+- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)

From ec72f377deef4851989be98528173b232e38b5b1 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Mon, 24 Apr 2023 20:04:13 +0100
Subject: [PATCH 8/9] move to 2.0.2

---
 doc/source/whatsnew/v2.0.2.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst
index 0a6738cb9b3dc..09932a2d2d571 100644
--- a/doc/source/whatsnew/v2.0.2.rst
+++ b/doc/source/whatsnew/v2.0.2.rst
@@ -20,6 +20,8 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
 -
 
 .. ---------------------------------------------------------------------------

From d8adf1f6f09d65bc500e39a022686435406cab88 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <>
Date: Mon, 24 Apr 2023 20:04:44 +0100
Subject: [PATCH 9/9] revert

---
 doc/source/whatsnew/v2.0.1.rst | 2 --
 1 file changed, 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst
index 386294ad8a72a..2613d12e43400 100644
--- a/doc/source/whatsnew/v2.0.1.rst
+++ b/doc/source/whatsnew/v2.0.1.rst
@@ -32,8 +32,6 @@ Bug fixes
 - Bug in :class:`~arrays.ArrowExtensionArray` with duration dtype overflowing when constructed from data containing numpy ``NaT`` (:issue:`52843`)
 - Bug in :func:`Series.dt.round` when passing a ``freq`` of equal or higher resolution compared to the :class:`Series` would raise a ``ZeroDivisionError`` (:issue:`52761`)
 - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
-- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
-- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)
 - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)