-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
EA: BoolArray #25415
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
EA: BoolArray #25415
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Cached mask array class; stays None until get_mask_array_type() resolves it.
_MaskArrayType = None


def get_mask_array_type():
    """Return the mask array class to use, resolving it on first call.

    The lookup is deferred to call time instead of import time because
    the implementations (e.g. the pyarrow-backed one) depend on pandas
    being importable.  The resolved class is cached in the module-level
    ``_MaskArrayType`` and returned directly on later calls.
    """
    global _MaskArrayType

    if _MaskArrayType is None:
        # Prefer ArrowMaskArray when its module can be imported,
        # otherwise fall back to NumpyMaskArray.
        try:
            from pandas.core.arrays.mask._pyarrow import ArrowMaskArray
        except ImportError:
            from pandas.core.arrays.mask._numpy import NumpyMaskArray

            _MaskArrayType = NumpyMaskArray
        else:
            _MaskArrayType = ArrowMaskArray

    return _MaskArrayType


__all__ = ['get_mask_array_type']
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
"""A boolean mask interface. | ||
|
||
This module provides an interface to a numpy / pyarrow boolean mask. | ||
This is limited as not all of the implementations can hold NA, so | ||
for consistency this module is internal-only. | ||
""" | ||
|
||
import copy | ||
|
||
import numpy as np | ||
|
||
from pandas.api.extensions import ExtensionDtype | ||
from pandas.api.types import is_scalar | ||
from pandas.core.arrays.base import ExtensionArray | ||
from pandas.core.missing import isna | ||
|
||
|
||
class MaskDtype(ExtensionDtype):
    """Extension dtype describing a boolean mask.

    Instances compare equal to the string ``'bool'``, to any other
    instance of the same dtype class and to ``np.dtype('bool')``.
    """

    type = np.bool_
    kind = 'b'
    name = 'bool'

    @classmethod
    def construct_from_string(cls, string):
        """Build the dtype from its string alias.

        Parameters
        ----------
        string : str
            Must be equal to ``cls.name``.

        Returns
        -------
        MaskDtype

        Raises
        ------
        TypeError
            If ``string`` is not the name of this dtype.
        """
        if string == cls.name:
            return cls()
        raise TypeError("Cannot construct a '{}' from "
                        "'{}'".format(cls, string))

    @property
    def _is_boolean(self):
        # Exposed as a property so that attribute access yields a real
        # bool instead of an (always truthy) bound method.
        return True

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # compare == to np.dtype('bool')
        if isinstance(other, str):
            return other == self.name
        if isinstance(other, type(self)):
            return True
        if isinstance(other, np.dtype):
            return other == 'bool'
        try:
            return hash(self) == hash(other)
        except TypeError:
            # Unhashable operands (lists, dicts, ...) can never be equal
            # to a dtype; report inequality instead of raising.
            return False
|
||
|
||
class MaskArray(ExtensionArray):
    """Common baseclass for both pyarrow and numpy masked arrays.

    The methods here rely on the concrete subclass for the underlying
    storage (``self._data``), for conversion to a NumPy array and for a
    ``from_scalars`` classmethod that builds a new instance from
    array-like values.

    NOTE(review): most operations below convert to a NumPy array via
    ``np.asarray`` before doing any work.  Reviewers asked whether these
    methods belong in the subclasses instead, and for the NumPy casts to
    be tracked so they can be replaced later.
    """
    _typ = "maskarray"

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Build an instance from ``scalars``.

        ``dtype`` and ``copy`` are accepted for interface compatibility
        but are not used.
        """
        return cls.from_scalars(scalars)

    @property
    def size(self):
        """Number of elements in the array."""
        return len(self)

    def __eq__(self, other):
        # Element-wise comparison; the result is a plain NumPy array.
        return np.asarray(self) == np.asarray(other)

    def __len__(self):
        return len(self._data)

    def isna(self):
        """Return a mask array flagging the missing entries."""
        nas = isna(np.asarray(self._data))
        return type(self).from_scalars(nas)

    def __invert__(self):
        return type(self).from_scalars(~np.asarray(self._data))

    def __or__(self, other):
        return type(self).from_scalars(
            np.asarray(self) | np.asarray(other))

    def __ior__(self, other):
        # Not truly in-place: a new instance is returned.
        return type(self).from_scalars(
            np.asarray(self) | np.asarray(other))

    def __and__(self, other):
        return type(self).from_scalars(
            np.asarray(self) & np.asarray(other))

    def __iand__(self, other):
        # Not truly in-place: a new instance is returned.
        return type(self).from_scalars(
            np.asarray(self) & np.asarray(other))

    def __getitem__(self, item):
        """Return a scalar for scalar ``item``, else a new mask array."""
        arr = np.asarray(self)
        if is_scalar(item):
            return arr[item]
        if isinstance(item, MaskArray):
            # Index with the plain boolean ndarray behind another mask.
            item = np.asarray(item)
        return type(self).from_scalars(arr[item])

    def view(self, dtype=None):
        """Return the data as a NumPy array, optionally viewed as ``dtype``."""
        arr = np.asarray(self._data)
        if dtype is not None:
            arr = arr.view(dtype=dtype)
        return arr

    def sum(self, axis=None, min_count=None):
        # axis and min_count are accepted but ignored.
        return np.asarray(self).sum()

    def copy(self, deep=False):
        """Return a new instance wrapping a copy of the underlying data.

        ``copy`` below refers to the stdlib module imported at file
        level, not to this method.
        """
        if deep:
            return type(self)(copy.deepcopy(self._data))
        return type(self)(copy.copy(self._data))

    def any(self, axis=0, out=None):
        # axis and out are accepted but ignored.
        return np.asarray(self._data).any()

    def all(self, axis=0, out=None):
        # axis and out are accepted but ignored.
        return np.asarray(self._data).all()

    def min(self, axis=0, out=None):
        # axis and out are accepted but ignored.
        return np.asarray(self._data).min()

    def max(self, axis=0, out=None):
        # axis and out are accepted but ignored.
        return np.asarray(self._data).max()

    def _reduce(self, method, skipna=True, **kwargs):
        """Apply the reduction named ``method``.

        Parameters
        ----------
        method : str
            Name of a method defined on this array (e.g. ``'sum'``).
        skipna : bool, default True
            Drop the entries flagged by ``isna()`` before reducing.
        **kwargs
            Accepted but not forwarded to the reduction.

        Raises
        ------
        TypeError
            If the array has no attribute called ``method``.
        """
        arr = self[~self.isna()] if skipna else self

        # we only allow explicitly defined methods
        # ndarrays actually support: mean, var, prod, min, max
        op = getattr(arr, method, None)
        if op is None:
            raise TypeError(
                "cannot perform '{}' on a '{}'".format(
                    method, type(self).__name__))
        # Called outside any try block so that errors raised by the
        # reduction itself are not mistaken for a missing method.
        return op()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
""" | ||
This module provides a NumPy-backed boolean mask array. | ||
""" | ||
|
||
import numpy as np | ||
|
||
from pandas.api.extensions import take | ||
from pandas.core.arrays.mask._base import MaskArray, MaskDtype | ||
|
||
|
||
class NumpyMaskDtype(MaskDtype):
    """Dtype attached to the NumPy-backed mask array."""

    # Used by NumpyMaskArray.take as the fill value when none is given.
    na_value = np.nan

    @classmethod
    def construct_array_type(cls):
        """Return the array class that goes with this dtype."""
        return NumpyMaskArray
|
||
|
||
class NumpyMaskArray(MaskArray):
    """Generic class which can be used to represent missing data.

    The values are held in ``self._data``, a NumPy ndarray of dtype
    ``np.bool_``.
    """

    dtype = NumpyMaskDtype()

    @classmethod
    def from_scalars(cls, values):
        """Build an instance from array-like ``values``, cast to bool.

        The cast avoids a copy when ``values`` is already a bool ndarray.
        """
        arr = np.asarray(values).astype(np.bool_, copy=False)
        return cls(arr, copy=False)

    def __init__(self, mask, copy=True):
        """
        Parameters
        ----------
        mask : numpy array
            Mask of missing values.
        copy : bool, default True
            Store a copy of ``mask`` rather than ``mask`` itself.
        """
        assert isinstance(mask, np.ndarray)
        assert mask.dtype == np.bool_

        if copy:
            mask = mask.copy()
        self._data = mask

    def __setitem__(self, key, value):
        self._data[key] = value

    def __array__(self, dtype=None):
        # With dtype=None this returns the underlying array itself (no
        # copy); otherwise the requested dtype is honoured.
        return np.asarray(self._data, dtype=dtype)

    def __iter__(self):
        return iter(self._data)

    @property
    def nbytes(self):
        """Number of bytes used by the underlying ndarray."""
        return self._data.nbytes

    def reshape(self, shape, **kwargs):
        """Return the data reshaped, as a plain NumPy array."""
        return np.asarray(self).reshape(shape, **kwargs)

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; casting to this array's own dtype is
        handled here, everything else is delegated to the parent class.
        """
        # needed to fix this astype for the Series constructor.
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        return super(NumpyMaskArray, self).astype(dtype, copy)

    def take(self, indices, allow_fill=False, fill_value=None, axis=None):
        """Take elements at ``indices``.

        ``axis`` is accepted but ignored.
        """
        # TODO: had to add axis here
        data = self._data

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indices, fill_value=fill_value,
                      allow_fill=allow_fill)
        # NOTE(review): the result is cast back to bool by from_scalars,
        # so a NaN fill value presumably ends up as True -- confirm this
        # is intended.
        return self._from_sequence(result, dtype=self.dtype)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several arrays of this type into a new instance.

        Declared as a classmethod so that it can be called on the class
        itself as well as on an instance.
        """
        concat = np.concatenate(to_concat)
        return cls.from_scalars(concat)
Uh oh!
There was an error while loading. Please reload this page.