From 541bd57499d9c109550a03f14a1a85682365a7e8 Mon Sep 17 00:00:00 2001
From: Stephannie Jimenez <steff456@hotmail.com>
Date: Thu, 24 Jun 2021 00:52:02 -0500
Subject: [PATCH 1/5] Add allow-copy flag

This PR adds a flag to throw an exception if the export cannot be
zero-copy. (e.g. for pandas, possible due to block manager where rows
are contiguous and columns are not) .

- Add `allow_zero_copy` flag to the DataFrame class.
- Propagate the flag to the buffer and raise a `RuntimeError` when it is true
- Fix `test_noncontiguous_columns`
- Make update in the requirements doc
---
 protocol/dataframe_protocol.py    |  9 ++++-
 protocol/pandas_implementation.py | 62 +++++++++++++++++++------------
 2 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
index 41b2e6e4..845c2252 100644
--- a/protocol/dataframe_protocol.py
+++ b/protocol/dataframe_protocol.py
@@ -347,7 +347,8 @@ class DataFrame:
     ``__dataframe__`` method of a public data frame class in a library adhering
     to the dataframe interchange protocol specification.
     """
-    def __dataframe__(self, nan_as_null : bool = False) -> dict:
+    def __dataframe__(self, nan_as_null : bool = False,
+                      allow_copy : bool = True) -> dict:
         """
         Produces a dictionary object following the dataframe protocol specification.
 
@@ -355,8 +356,14 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
         producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
         It is intended for cases where the consumer does not support the bit
         mask or byte mask that is the producer's native representation.
+
+        ``allow_copy`` is a keyword that defines whether or not the library is
+        allowed to make a copy of the data. This can for example happen if a
+        library supports strided buffers (those would need a copy, because this
+        protocol specifies contiguous buffers).
         """
         self._nan_as_null = nan_as_null
+        self._allow_zero_zopy = allow_copy
         return {
             "dataframe": self,  # DataFrame object adhering to the protocol
             "version": 0        # Version number of the protocol
diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index 30ff9052..e4490d03 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -35,7 +35,8 @@
 ColumnObject = Any
 
 
-def from_dataframe(df : DataFrameObject) -> pd.DataFrame:
+def from_dataframe(df : DataFrameObject,
+                   allow_copy : bool = True) -> pd.DataFrame:
     """
     Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__``
     """
@@ -46,7 +47,7 @@ def from_dataframe(df : DataFrameObject) -> pd.DataFrame:
     if not hasattr(df, '__dataframe__'):
         raise ValueError("`df` does not support __dataframe__")
 
-    return _from_dataframe(df.__dataframe__())
+    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
 
 
 def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
@@ -196,7 +197,7 @@ def convert_string_column(col : ColumnObject) -> np.ndarray:
             v = mbuf[i/8]
             if null_value == 1:
                 v = ~v
-            
+
             if v & (1<<(i%8)):
                 str_list.append(np.nan)
                 continue
@@ -221,7 +222,8 @@ def convert_string_column(col : ColumnObject) -> np.ndarray:
     return np.asarray(str_list, dtype="object")
 
 
-def __dataframe__(cls, nan_as_null : bool = False) -> dict:
+def __dataframe__(cls, nan_as_null : bool = False,
+                  allow_copy : bool = True) -> dict:
     """
     The public method to attach to pd.DataFrame.
 
@@ -232,8 +234,14 @@ def __dataframe__(cls, nan_as_null : bool = False) -> dict:
     producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
     This currently has no effect; once support for nullable extension
     dtypes is added, this value should be propagated to columns.
+
+    ``allow_copy`` is a keyword that defines if the given implementation
+    is going to support striding buffers. It is optional, and the libraries
+    do not need to implement it. Currently, if the flag is set to ``True`` it
+    will raise a ``RuntimeError``.
     """
-    return _PandasDataFrame(cls, nan_as_null=nan_as_null)
+    return _PandasDataFrame(
+        cls, nan_as_null=nan_as_null, allow_copy=allow_copy)
 
 
 # Monkeypatch the Pandas DataFrame class to support the interchange protocol
@@ -248,16 +256,16 @@ class _PandasBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, x : np.ndarray) -> None:
+    def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None:
         """
         Handle only regular columns (= numpy arrays) for now.
         """
-        if not x.strides == (x.dtype.itemsize,):
-            # Array is not contiguous - this is possible to get in Pandas,
-            # there was some discussion on whether to support it. Som extra
-            # complexity for libraries that don't support it (e.g. Arrow),
-            # but would help with numpy-based libraries like Pandas.
-            raise RuntimeError("Design needs fixing - non-contiguous buffer")
+        if not allow_copy:
+            # Array is not contiguous and strided buffers do not need to be
+            # supported. It brings some extra complexity for libraries that
+            # don't support it (e.g. Arrow).
+            raise RuntimeError(
+                "Exports cannot be zero-copy in the case of a non-contiguous buffer")
 
         # Store the numpy array in which the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
@@ -313,7 +321,8 @@ class _PandasColumn:
 
     """
 
-    def __init__(self, column : pd.Series) -> None:
+    def __init__(self, column : pd.Series,
+                 allow_copy : bool = True) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
@@ -324,6 +333,7 @@ def __init__(self, column : pd.Series) -> None:
 
         # Store the column as a private attribute
         self._col = column
+        self._allow_copy = allow_copy
 
     @property
     def size(self) -> int:
@@ -553,11 +563,13 @@ def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]:  # Any is for self.dtyp
         """
         _k = _DtypeKind
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _PandasBuffer(self._col.to_numpy())
+            buffer = _PandasBuffer(
+                self._col.to_numpy(), allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.values.codes
-            buffer = _PandasBuffer(codes)
+            buffer = _PandasBuffer(
+                codes, allow_copy=self._allow_copy)
             dtype = self._dtype_from_pandasdtype(codes.dtype)
         elif self.dtype[0] == _k.STRING:
             # Marshal the strings from a NumPy object array into a byte array
@@ -670,7 +682,8 @@ class _PandasDataFrame:
     ``pd.DataFrame.__dataframe__`` as objects with the methods and
     attributes defined on this class.
     """
-    def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
+    def __init__(self, df : pd.DataFrame, nan_as_null : bool = False,
+                 allow_copy : bool = True) -> None:
         """
         Constructor - an instance of this (private) class is returned from
         `pd.DataFrame.__dataframe__`.
@@ -681,6 +694,7 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
         # This currently has no effect; once support for nullable extension
         # dtypes is added, this value should be propagated to columns.
         self._nan_as_null = nan_as_null
+        self._allow_copy = allow_copy
 
     def num_columns(self) -> int:
         return len(self._df.columns)
@@ -695,13 +709,16 @@ def column_names(self) -> Iterable[str]:
         return self._df.columns.tolist()
 
     def get_column(self, i: int) -> _PandasColumn:
-        return _PandasColumn(self._df.iloc[:, i])
+        return _PandasColumn(
+            self._df.iloc[:, i], allow_copy=self._allow_copy)
 
     def get_column_by_name(self, name: str) -> _PandasColumn:
-        return _PandasColumn(self._df[name])
+        return _PandasColumn(
+            self._df[name], allow_copy=self._allow_copy)
 
     def get_columns(self) -> Iterable[_PandasColumn]:
-        return [_PandasColumn(self._df[name]) for name in self._df.columns]
+        return [_PandasColumn(self._df[name], allow_copy=self._allow_copy)
+                for name in self._df.columns]
 
     def select_columns(self, indices: Sequence[int]) -> '_PandasDataFrame':
         if not isinstance(indices, collections.Sequence):
@@ -739,13 +756,12 @@ def test_mixed_intfloat():
 
 
 def test_noncontiguous_columns():
-    # Currently raises: TBD whether it should work or not, see code comment
-    # where the RuntimeError is raised.
+    # Currently raises if the flag of allow zero copy is True.
     arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     df = pd.DataFrame(arr)
     assert df[0].to_numpy().strides == (24,)
-    pytest.raises(RuntimeError, from_dataframe, df)
-    #df2 = from_dataframe(df)
+    with pytest.raises(RuntimeError):
+        df2 = from_dataframe(df, allow_copy=False)
 
 
 def test_categorical_dtype():

From 0471577dbca6f72a4ca0013b0148e60de1af532e Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Mon, 23 Aug 2021 21:51:18 +0200
Subject: [PATCH 2/5] Improve the `allow_copy` test case

---
 protocol/pandas_implementation.py | 34 ++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index e4490d03..fd7c2472 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -235,10 +235,12 @@ def __dataframe__(cls, nan_as_null : bool = False,
     This currently has no effect; once support for nullable extension
     dtypes is added, this value should be propagated to columns.
 
-    ``allow_copy`` is a keyword that defines if the given implementation
-    is going to support striding buffers. It is optional, and the libraries
-    do not need to implement it. Currently, if the flag is set to ``True`` it
-    will raise a ``RuntimeError``.
+    ``allow_copy`` is a keyword that defines whether or not the library is
+    allowed to make a copy of the data. This can for example happen if a
+    library supports strided buffers (those would need a copy, because this
+    protocol specifies contiguous buffers).
+    Currently, if the flag is set to ``False`` and a copy is needed, a
+    ``RuntimeError`` will be raised.
     """
     return _PandasDataFrame(
         cls, nan_as_null=nan_as_null, allow_copy=allow_copy)
@@ -260,12 +262,14 @@ def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None:
         """
         Handle only regular columns (= numpy arrays) for now.
         """
-        if not allow_copy:
-            # Array is not contiguous and strided buffers do not need to be
-            # supported. It brings some extra complexity for libraries that
-            # don't support it (e.g. Arrow).
-            raise RuntimeError(
-                "Exports cannot be zero-copy in the case of a non-contiguous buffer")
+        if not x.strides == (x.dtype.itemsize,):
+            # The protocol does not support strided buffers, so a copy is
+            # necessary. If that's not allowed, we need to raise an exception.
+            if allow_copy:
+                x = x.copy()
+            else:
+                raise RuntimeError("Exports cannot be zero-copy in the case "
+                                   "of a non-contiguous buffer")
 
         # Store the numpy array in which the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
@@ -756,12 +760,14 @@ def test_mixed_intfloat():
 
 
 def test_noncontiguous_columns():
-    # Currently raises if the flag of allow zero copy is True.
     arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
-    df = pd.DataFrame(arr)
-    assert df[0].to_numpy().strides == (24,)
+    df = pd.DataFrame(arr, columns=['a', 'b', 'c'])
+    assert df['a'].to_numpy().strides == (24,)
+    df2 = from_dataframe(df)  # uses default of allow_copy=True
+    tm.assert_frame_equal(df, df2)
+
     with pytest.raises(RuntimeError):
-        df2 = from_dataframe(df, allow_copy=False)
+        from_dataframe(df, allow_copy=False)
 
 
 def test_categorical_dtype():

From c7a0bbbdeb14caef5c7688f90265a979b3e43a86 Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Mon, 23 Aug 2021 21:52:04 +0200
Subject: [PATCH 3/5] Commit bug fix for `allow_copy` behavior

---
 protocol/pandas_implementation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index fd7c2472..a7d4862f 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -68,7 +68,9 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
         col = df.get_column_by_name(name)
         if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
             # Simple numerical or bool dtype, turn into numpy array
-            columns[name] = convert_column_to_ndarray(col)
+            # FIXME: .copy() not desired - see NOTE in buffer_to_ndarray for
+            # why it's needed (memory not owned)
+            columns[name] = convert_column_to_ndarray(col).copy()
         elif col.dtype[0] == _k.CATEGORICAL:
             columns[name] = convert_categorical_column(col)
         elif col.dtype[0] == _k.STRING:

From 549a67538f42f4d37d6065bfc9317d2700f3544c Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Mon, 23 Aug 2021 22:21:13 +0200
Subject: [PATCH 4/5] Hold on to memory buffers when constructing a Pandas
 dataframe

---
 protocol/pandas_implementation.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index a7d4862f..c4fb9657 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -64,21 +64,24 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
     # least for now, deal with non-numpy dtypes later).
     columns = dict()
     _k = _DtypeKind
+    _buffers = []  # hold on to buffers, keeps memory alive
     for name in df.column_names():
         col = df.get_column_by_name(name)
         if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
             # Simple numerical or bool dtype, turn into numpy array
-            # FIXME: .copy() not desired - see NOTE in buffer_to_ndarray for
-            # why it's needed (memory not owned)
-            columns[name] = convert_column_to_ndarray(col).copy()
+            columns[name], _buf = convert_column_to_ndarray(col)
         elif col.dtype[0] == _k.CATEGORICAL:
-            columns[name] = convert_categorical_column(col)
+            columns[name], _buf = convert_categorical_column(col)
         elif col.dtype[0] == _k.STRING:
-            columns[name] = convert_string_column(col)
+            columns[name], _buf = convert_string_column(col)
         else:
             raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
 
-    return pd.DataFrame(columns)
+        _buffers.append(_buf)
+
+    df_new = pd.DataFrame(columns)
+    df_new._buffers = _buffers
+    return df_new
 
 
 class _DtypeKind(enum.IntEnum):
@@ -103,7 +106,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
                                   "sentinel values not handled yet")
 
     _buffer, _dtype = col.get_buffers()["data"]
-    return buffer_to_ndarray(_buffer, _dtype)
+    return buffer_to_ndarray(_buffer, _dtype), _buffer
 
 
 def buffer_to_ndarray(_buffer, _dtype) -> np.ndarray:
@@ -162,7 +165,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
         raise NotImplementedError("Only categorical columns with sentinel "
                                   "value supported at the moment")
 
-    return series
+    return series, codes_buffer
 
 
 def convert_string_column(col : ColumnObject) -> np.ndarray:
@@ -221,7 +224,7 @@ def convert_string_column(col : ColumnObject) -> np.ndarray:
         str_list.append(s)
 
     # Convert the string list to a NumPy array
-    return np.asarray(str_list, dtype="object")
+    return np.asarray(str_list, dtype="object"), buffers
 
 
 def __dataframe__(cls, nan_as_null : bool = False,
@@ -250,6 +253,7 @@ def __dataframe__(cls, nan_as_null : bool = False,
 
 # Monkeypatch the Pandas DataFrame class to support the interchange protocol
 pd.DataFrame.__dataframe__ = __dataframe__
+pd.DataFrame._buffers = []
 
 
 # Implementation of interchange protocol

From dd383027914612853c749c914bd87a71526cdb95 Mon Sep 17 00:00:00 2001
From: Athan <kgryte@gmail.com>
Date: Mon, 23 Aug 2021 16:18:51 -0700
Subject: [PATCH 5/5] Update copy

---
 protocol/dataframe_protocol.py    | 6 +++---
 protocol/pandas_implementation.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
index 845c2252..9a36b81c 100644
--- a/protocol/dataframe_protocol.py
+++ b/protocol/dataframe_protocol.py
@@ -358,9 +358,9 @@ def __dataframe__(self, nan_as_null : bool = False,
         mask or byte mask that is the producer's native representation.
 
         ``allow_copy`` is a keyword that defines whether or not the library is
-        allowed to make a copy of the data. This can for example happen if a
-        library supports strided buffers (those would need a copy, because this
-        protocol specifies contiguous buffers).
+        allowed to make a copy of the data. For example, copying data would be
+        necessary if a library supports strided buffers, given that this protocol
+        specifies contiguous buffers.
         """
         self._nan_as_null = nan_as_null
         self._allow_zero_zopy = allow_copy
diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index c4fb9657..b61250a3 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -241,9 +241,9 @@ def __dataframe__(cls, nan_as_null : bool = False,
     dtypes is added, this value should be propagated to columns.
 
     ``allow_copy`` is a keyword that defines whether or not the library is
-    allowed to make a copy of the data. This can for example happen if a
-    library supports strided buffers (those would need a copy, because this
-    protocol specifies contiguous buffers).
+    allowed to make a copy of the data. For example, copying data would be
+    necessary if a library supports strided buffers, given that this protocol
+    specifies contiguous buffers.
     Currently, if the flag is set to ``False`` and a copy is needed, a
     ``RuntimeError`` will be raised.
     """