Skip to content

Commit 223d32e

Browse files
committed
create NumpyBoolArray and ArrowBoolArray subclasses for MaskArray
1 parent 65275fd commit 223d32e

File tree

14 files changed

+297
-128
lines changed

14 files changed

+297
-128
lines changed

pandas/core/arrays/_mask.py

Lines changed: 88 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,103 +1,136 @@
1+
"""A boolean mask interface.
2+
3+
This module provides a numpy-like mask; it can optionally be backed
4+
by pyarrow.
5+
"""
6+
17
import numpy as np
8+
from pandas.core.base import PandasObject
29

310

4-
class NAMask():
11+
# TODO: The NumpyBoolArray should actually be a full EA.
12+
13+
class NumpyBoolArray(PandasObject):
514
"""Generic class which can be used to represent missing data.
15+
"""
16+
_typ = "maskarray"
617

7-
Will use bitarray if available; otherwise will use numpy."""
18+
@property
19+
def dtype(self):
20+
return np.dtype('bool')
821

9-
def __init__(self, mask):
22+
def __init__(self, mask, copy=True):
1023
"""
1124
Parameters
1225
----------
1326
mask : numpy array
1427
Mask of missing values.
1528
"""
16-
17-
self._has_bitarray = False
18-
try:
19-
import bitarray
20-
globals()['bitarray'] = bitarray
21-
self._has_bitarray = True
22-
self._data = self._numpy_to_bitarray(mask)
23-
except (ImportError, ModuleNotFoundError):
24-
self._data = mask.astype(bool, copy=False)
25-
26-
def _numpy_to_bitarray(self, arr):
27-
bit_arr = bitarray()
28-
bit_arr.pack(arr.astype(bool, copy=False))
29-
30-
def _bitarray_to_numpy(self, arr):
31-
return np.fromstring(arr.unpack(), dtype=bool)
32-
33-
def __getitem__(self, item):
34-
if self._has_bitarray:
35-
raise NotImplementedError
36-
37-
return self._data[item]
29+
mask = np.asarray(mask)
30+
self._data = mask.astype(bool, copy=copy)
31+
32+
def __repr__(self):
33+
from pandas.io.formats.printing import format_object_summary
34+
35+
template = (
36+
u'{class_name}'
37+
u'{data}\n'
38+
u'Length: {length}'
39+
)
40+
# the short repr has no trailing newline, while the truncated
41+
# repr does. So we include a newline in our template, and strip
42+
# any trailing newlines from format_object_summary
43+
data = format_object_summary(self, self._formatter(),
44+
indent_for_name=False).rstrip(', \n')
45+
class_name = u'<{}>\n'.format(self.__class__.__name__)
46+
return template.format(class_name=class_name, data=data,
47+
length=len(self))
48+
49+
def _formatter(self, boxed=False):
50+
return repr
51+
52+
def __getitem__(self, key):
53+
return self._data[key]
3854

3955
def __setitem__(self, key, value):
40-
if self._has_bitarray:
41-
raise NotImplementedError
42-
4356
self._data[key] = value
4457

45-
def __array__(self):
46-
if self._has_bitarray:
47-
raise NotImplementedError
58+
def __len__(self):
59+
return len(self._data)
4860

61+
def __eq__(self, other):
62+
return np.array(self, copy=False) == np.array(other, copy=False)
63+
64+
def __array__(self, dtype=None):
4965
return self._data
5066

5167
def __iter__(self):
5268
for i in range(len(self._data)):
5369
yield self._data[i]
5470

5571
def __invert__(self):
56-
if self._has_bitarray:
57-
raise NotImplementedError
58-
5972
return type(self)(~self._data)
6073

6174
def __or__(self, other):
62-
if self._has_bitarray:
63-
raise NotImplementedError
64-
65-
return self._data.__or__(other)
75+
return np.array(
76+
self, copy=False).__or__(np.array(other, copy=False))
6677

6778
def __ior__(self, other):
68-
if self._has_bitarray:
69-
raise NotImplementedError
79+
return np.array(self, copy=False) | np.array(other, copy=False)
7080

71-
return self._data | other
81+
def __and__(self, other):
82+
return np.array(self, copy=False).__and__(np.array(other, copy=False))
83+
84+
def __iand__(self, other):
85+
return np.array(self, copy=False) & (np.array(other, copy=False))
7286

7387
@property
7488
def nbytes(self):
75-
if self._has_bitarray:
76-
return self._data.buffer_info()[1]
77-
7889
return self._data.nbytes
7990

8091
@property
8192
def size(self):
82-
if self._has_bitarray:
83-
raise NotImplementedError
93+
return len(self)
94+
95+
@property
96+
def ndim(self):
97+
return 1
8498

85-
return self._data.size
99+
@property
100+
def shape(self):
101+
return (len(self), )
86102

87-
def astype(self, dtype, copy=False):
88-
if self._has_bitarray:
89-
raise NotImplementedError
103+
def reshape(self, shape, **kwargs):
104+
return np.array(self, copy=False).reshape(shape, **kwargs)
90105

91-
return self._data.astype(dtype, copy=copy)
106+
def astype(self, dtype, copy=False):
107+
return np.array(self, copy=False).astype(dtype, copy=copy)
92108

93109
def any(self):
94110
return self._data.any()
95111

112+
def all(self):
113+
return self._data.all()
114+
96115
def copy(self):
97116
return type(self)(self._data.copy())
98117

99-
def sum(self):
100-
if self._has_bitarray:
101-
raise NotImplementedError
118+
def sum(self, axis=None):
119+
return np.array(self, copy=False).sum()
120+
121+
def take(self, indicies, **kwargs):
122+
return np.array(self, copy=False).take(indicies)
123+
124+
125+
def get_mask_array_type():
126+
# protect imports
127+
128+
# if ArrowBoolArray is available use it
129+
# otherwise use the NumpyBoolArray
130+
try:
131+
from pandas.core.arrays.bool import ArrowBoolArray
132+
MaskArray = ArrowBoolArray
133+
except ImportError:
134+
MaskArray = NumpyBoolArray
102135

103-
return self._data.sum()
136+
return MaskArray

pandas/tests/extension/arrow/bool.py renamed to pandas/core/arrays/bool.py

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import pyarrow as pa
1313

1414
import pandas as pd
15+
from pandas.api.types import is_scalar
1516
from pandas.api.extensions import (
1617
ExtensionArray, ExtensionDtype, register_extension_dtype, take)
1718

@@ -41,17 +42,20 @@ def _is_boolean(self):
4142

4243

4344
class ArrowBoolArray(ExtensionArray):
44-
def __init__(self, values):
45+
def __init__(self, values, copy=False):
4546
if not isinstance(values, pa.ChunkedArray):
46-
raise ValueError
47+
48+
# TODO: hack a minute
49+
values = np.asarray(values).astype(np.bool_, copy=False)
50+
values = pa.chunked_array([values])
51+
else:
52+
if copy:
53+
values = values.copy()
4754

4855
assert values.type == pa.bool_()
4956
self._data = values
5057
self._dtype = ArrowBoolDtype()
5158

52-
def __repr__(self):
53-
return "ArrowBoolArray({})".format(repr(self._data))
54-
5559
@classmethod
5660
def from_scalars(cls, values):
5761
arr = pa.chunked_array([pa.array(np.asarray(values))])
@@ -67,15 +71,24 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
6771
return cls.from_scalars(scalars)
6872

6973
def __getitem__(self, item):
70-
if pd.api.types.is_scalar(item):
74+
if is_scalar(item):
7175
return self._data.to_pandas()[item]
7276
else:
7377
vals = self._data.to_pandas()[item]
7478
return type(self).from_scalars(vals)
7579

80+
def __setitem__(self, key, value):
81+
# TODO: hack-a-minute
82+
data = np.array(self._data)
83+
data[key] = value
84+
self._data = pa.array(data)
85+
7686
def __len__(self):
7787
return len(self._data)
7888

89+
def view(self, dtype):
90+
return np.array(self._data, copy=False).view(dtype)
91+
7992
def astype(self, dtype, copy=True):
8093
# needed to fix this astype for the Series constructor.
8194
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
@@ -94,11 +107,36 @@ def nbytes(self):
94107
for x in chunk.buffers()
95108
if x is not None)
96109

110+
@property
111+
def size(self):
112+
return len(self)
113+
114+
def __repr__(self):
115+
from pandas.io.formats.printing import format_object_summary
116+
117+
template = (
118+
u'{class_name}'
119+
u'{data}\n'
120+
u'Length: {length}'
121+
)
122+
# the short repr has no trailing newline, while the truncated
123+
# repr does. So we include a newline in our template, and strip
124+
# any trailing newlines from format_object_summary
125+
data = format_object_summary(self, self._formatter(),
126+
indent_for_name=False).rstrip(', \n')
127+
class_name = u'<{}>\n'.format(self.__class__.__name__)
128+
return template.format(class_name=class_name, data=data,
129+
length=len(self))
130+
131+
def __eq__(self, other):
132+
return np.array(self, copy=False) == np.array(other, copy=False)
133+
97134
def isna(self):
98135
nas = pd.isna(self._data.to_pandas())
99136
return type(self).from_scalars(nas)
100137

101-
def take(self, indices, allow_fill=False, fill_value=None):
138+
def take(self, indices, allow_fill=False, fill_value=None, axis=None):
139+
# TODO: had to add axis here
102140
data = self._data.to_pandas()
103141

104142
if allow_fill and fill_value is None:
@@ -114,6 +152,9 @@ def copy(self, deep=False):
114152
else:
115153
return type(self)(copy.copy(self._data))
116154

155+
def sum(self, axis=None):
156+
return np.array(self, copy=False).sum()
157+
117158
def _concat_same_type(cls, to_concat):
118159
chunks = list(itertools.chain.from_iterable(x._data.chunks
119160
for x in to_concat))
@@ -125,6 +166,22 @@ def __invert__(self):
125166
~self._data.to_pandas()
126167
)
127168

169+
def __or__(self, other):
170+
return np.array(
171+
self, copy=False).__or__(np.array(other, copy=False))
172+
173+
def __ior__(self, other):
174+
return np.array(self, copy=False) | np.array(other, copy=False)
175+
176+
def __and__(self, other):
177+
return np.array(self, copy=False).__and__(np.array(other, copy=False))
178+
179+
def __iand__(self, other):
180+
return np.array(self, copy=False) & (np.array(other, copy=False))
181+
182+
def __array__(self, dtype=None):
183+
return np.array(self._data, copy=False)
184+
128185
def _reduce(self, method, skipna=True, **kwargs):
129186
if skipna:
130187
arr = self[~self.isna()]

pandas/core/arrays/integer.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,8 @@
1919

2020
from pandas.core import nanops
2121
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
22-
<<<<<<< HEAD
22+
from pandas.core.arrays._mask import get_mask_array_type
2323
from pandas.core.tools.numeric import to_numeric
24-
=======
25-
from pandas.core.arrays._mask import NAMask
26-
>>>>>>> 384287e71... Revert unnecessary changes from master
2724

2825

2926
class _IntegerDtype(ExtensionDtype):
@@ -287,12 +284,14 @@ def dtype(self):
287284
return _dtypes[str(self._data.dtype)]
288285

289286
def __init__(self, values, mask, copy=False):
287+
MaskArray = get_mask_array_type()
288+
290289
if not (isinstance(values, np.ndarray)
291290
and is_integer_dtype(values.dtype)):
292291
raise TypeError("values should be integer numpy array. Use "
293292
"the 'integer_array' function instead")
294-
if not (isinstance(mask, NAMask) or (
295-
isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype))):
293+
if not (isinstance(mask, (np.ndarray, MaskArray)) and
294+
is_bool_dtype(mask.dtype)):
296295
raise TypeError("mask should be boolean numpy array. Use "
297296
"the 'integer_array' function instead")
298297

@@ -301,7 +300,7 @@ def __init__(self, values, mask, copy=False):
301300
mask = mask.copy()
302301

303302
self._data = values
304-
self._mask = NAMask(mask)
303+
self._mask = MaskArray(mask, copy=False)
305304

306305
@classmethod
307306
def _from_sequence(cls, scalars, dtype=None, copy=False):
@@ -328,22 +327,7 @@ def __getitem__(self, item):
328327
if self._mask[item]:
329328
return self.dtype.na_value
330329
return self._data[item]
331-
<<<<<<< HEAD
332-
return type(self)(self._data[item], self._mask[item])
333-
=======
334-
335-
<<<<<<< HEAD
336-
return type(self)(self._data[item],
337-
<<<<<<< HEAD
338-
mask=_bitarray_to_numpy(self._mask)[item],
339-
dtype=self.dtype)
340-
>>>>>>> 2ff4b0907... First pass at implementation (needs refactor)
341-
=======
342-
mask=_bitarray_to_numpy(self._mask)[item])
343-
>>>>>>> e085674ac... Reverted changes; created new module for mask
344-
=======
345330
return type(self)(self._data[item], self._mask[item])
346-
>>>>>>> 384287e71... Revert unnecessary changes from master
347331

348332
def _coerce_to_ndarray(self):
349333
"""

0 commit comments

Comments
 (0)