2
2
import enum
3
3
from abc import ABC , abstractmethod
4
4
5
+
5
6
class DlpackDeviceType (enum .IntEnum ):
6
7
"""Integer enum for device type codes matching DLPack."""
7
8
@@ -14,6 +15,7 @@ class DlpackDeviceType(enum.IntEnum):
14
15
VPI = 9
15
16
ROCM = 10
16
17
18
+
17
19
class DtypeKind (enum .IntEnum ):
18
20
"""
19
21
Integer enum for data types.
@@ -44,6 +46,7 @@ class DtypeKind(enum.IntEnum):
44
46
DATETIME = 22
45
47
CATEGORICAL = 23
46
48
49
+
47
50
class ColumnNullType (enum .IntEnum ):
48
51
"""
49
52
Integer enum for null type representation.
@@ -68,6 +71,7 @@ class ColumnNullType(enum.IntEnum):
68
71
USE_BITMASK = 3
69
72
USE_BYTEMASK = 4
70
73
74
+
71
75
class ColumnBuffers (TypedDict ):
72
76
data : Tuple ["Buffer" , Any ] # first element is a buffer containing the column data;
73
77
# second element is the data buffer's associated dtype
@@ -86,11 +90,13 @@ class ColumnBuffers(TypedDict):
86
90
class Buffer (ABC ):
87
91
"""
88
92
Data in the buffer is guaranteed to be contiguous in memory.
93
+
89
94
Note that there is no dtype attribute present, a buffer can be thought of
90
95
as simply a block of memory. However, if the column that the buffer is
91
96
attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
92
97
implemented, then that dtype information will be contained in the return
93
98
value from ``__dlpack__``.
99
+
94
100
This distinction is useful to support both (a) data exchange via DLPack on a
95
101
buffer and (b) dtypes like variable-length strings which do not have a
96
102
fixed number of bytes per element.
@@ -116,9 +122,12 @@ def ptr(self) -> int:
116
122
def __dlpack__ (self ):
117
123
"""
118
124
Produce DLPack capsule (see array API standard).
125
+
119
126
Raises:
127
+
120
128
- TypeError : if the buffer contains unsupported dtypes.
121
129
- NotImplementedError : if DLPack support is not implemented
130
+
122
131
Useful to have to connect to array libraries. Support is optional because
123
132
it's not completely trivial to implement for a Python-only library.
124
133
"""
@@ -138,27 +147,33 @@ class Column(ABC):
138
147
"""
139
148
A column object, with only the methods and properties required by the
140
149
interchange protocol defined.
150
+
141
151
A column can contain one or more chunks. Each chunk can contain up to three
142
152
buffers - a data buffer, a mask buffer (depending on null representation),
143
153
and an offsets buffer (if variable-size binary; e.g., variable-length
144
154
strings).
155
+
145
156
TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
146
157
Instead, it seems to use "children" for both columns with a bit mask,
147
158
and for nested dtypes. Unclear whether this is elegant or confusing.
148
159
This design requires checking the null representation explicitly.
160
+
149
161
The Arrow design requires checking:
150
162
1. the ARROW_FLAG_NULLABLE (for sentinel values)
151
163
2. if a column has two children, combined with one of those children
152
164
having a null dtype.
165
+
153
166
Making the mask concept explicit seems useful. One null dtype would
154
167
not be enough to cover both bit and byte masks, so that would mean
155
168
even more checking if we did it the Arrow way.
169
+
156
170
TBD: there's also the "chunk" concept here, which is implicit in Arrow as
157
171
multiple buffers per array (= column here). Semantically it may make
158
172
sense to have both: chunks were meant for example for lazy evaluation
159
173
of data which doesn't fit in memory, while multiple buffers per column
160
174
could also come from doing a selection operation on a single
161
175
contiguous buffer.
176
+
162
177
Given these concepts, one would expect chunks to be all of the same
163
178
size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
164
179
while multiple buffers could have data-dependent lengths. Not an issue
@@ -167,6 +182,7 @@ class Column(ABC):
167
182
Are multiple chunks *and* multiple buffers per column necessary for
168
183
the purposes of this interchange protocol, or must producers either
169
184
reuse the chunk concept for this or copy the data?
185
+
170
186
Note: this Column object can only be produced by ``__dataframe__``, so
171
187
doesn't need its own version or ``__column__`` protocol.
172
188
"""
@@ -176,6 +192,7 @@ class Column(ABC):
176
192
def size (self ) -> Optional [int ]:
177
193
"""
178
194
Size of the column, in elements.
195
+
179
196
Corresponds to DataFrame.num_rows() if column is a single chunk;
180
197
equal to size of this current chunk otherwise.
181
198
"""
@@ -186,6 +203,7 @@ def size(self) -> Optional[int]:
186
203
def offset (self ) -> int :
187
204
"""
188
205
Offset of first element.
206
+
189
207
May be > 0 if using chunks; for example for a column with N chunks of
190
208
equal size M (only the last chunk may be shorter),
191
209
``offset = n * M``, ``n = 0 .. N-1``.
@@ -197,10 +215,12 @@ def offset(self) -> int:
197
215
def dtype (self ) -> Tuple [DtypeKind , int , str , str ]:
198
216
"""
199
217
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
218
+
200
219
Bit-width : the number of bits as an integer
201
220
Format string : data type description format string in Apache Arrow C
202
221
Data Interface format.
203
222
Endianness : currently only native endianness (``=``) is supported
223
+
204
224
Notes:
205
225
- Kind specifiers are aligned with DLPack where possible (hence the
206
226
jump to 20, leave enough room for future extension)
@@ -229,6 +249,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]:
229
249
If the dtype is categorical, there are two options:
230
250
- There are only values in the data buffer.
231
251
- There is a separate dictionary-style encoding for categorical values.
252
+
232
253
Raises TypeError if the dtype is not categorical.
233
254
234
255
Returns the description on how to interpret the data buffer:
@@ -238,6 +259,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]:
238
259
categorical values to other objects exists
239
260
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
240
261
None if not a dictionary-style categorical.
262
+
241
263
TBD: are there any other in-memory representations that are needed?
242
264
"""
243
265
pass
@@ -248,6 +270,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
248
270
"""
249
271
Return the missing value (or "null") representation the column dtype
250
272
uses, as a tuple ``(kind, value)``.
273
+
251
274
Value : if kind is "sentinel value", the actual value. If kind is a bit
252
275
mask or a byte mask, the value (0 or 1) indicating a missing value. None
253
276
otherwise.
@@ -259,6 +282,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
259
282
def null_count (self ) -> Optional [int ]:
260
283
"""
261
284
Number of null elements, if known.
285
+
262
286
Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
263
287
"""
264
288
pass
@@ -282,6 +306,7 @@ def num_chunks(self) -> int:
282
306
def get_chunks (self , n_chunks : Optional [int ] = None ) -> Iterable ["Column" ]:
283
307
"""
284
308
Return an iterator yielding the chunks.
309
+
285
310
See `DataFrame.get_chunks` for details on ``n_chunks``.
286
311
"""
287
312
pass
@@ -290,7 +315,9 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
290
315
def get_buffers (self ) -> ColumnBuffers :
291
316
"""
292
317
Return a dictionary containing the underlying buffers.
318
+
293
319
The returned dictionary has the following contents:
320
+
294
321
- "data": a two-element tuple whose first element is a buffer
295
322
containing the data and whose second element is the data
296
323
buffer's associated dtype.
@@ -320,14 +347,17 @@ class DataFrame(ABC):
320
347
"""
321
348
A data frame class, with only the methods required by the interchange
322
349
protocol defined.
350
+
323
351
A "data frame" represents an ordered collection of named columns.
324
352
A column's "name" must be a unique string.
325
353
Columns may be accessed by name or by position.
354
+
326
355
This could be a public data frame class, or an object with the methods and
327
356
attributes defined on this DataFrame class could be returned from the
328
357
``__dataframe__`` method of a public data frame class in a library adhering
329
358
to the dataframe interchange protocol specification.
330
359
"""
360
+
331
361
version = 0 # version of the protocol
332
362
333
363
@property
@@ -414,6 +444,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame":
414
444
def get_chunks (self , n_chunks : Optional [int ] = None ) -> Iterable ["DataFrame" ]:
415
445
"""
416
446
Return an iterator yielding the chunks.
447
+
417
448
By default (None), yields the chunks that the data is stored as by the
418
449
producer. If given, ``n_chunks`` must be a multiple of
419
450
``self.num_chunks()``, meaning the producer must subdivide each chunk
0 commit comments