# Reconstructed from the "new file" hunk for pandas/core/arrays/_mask.py
# (index 0000000000000..7b908f519be95).
import numpy as np


class NAMask:
    """Container for a boolean mask marking missing values.

    The mask is stored as a ``bitarray.bitarray`` when the optional
    ``bitarray`` package can be imported and as a boolean numpy array
    otherwise.  Operations that are not yet available for bitarray-backed
    storage raise ``NotImplementedError``.
    """

    def __init__(self, mask):
        """
        Parameters
        ----------
        mask : numpy array or NAMask
            Mask of missing values.  A numpy array is viewed as bool
            without copying when it already has bool dtype.  Another
            ``NAMask`` is re-wrapped and shares its storage.
        """
        if isinstance(mask, NAMask):
            # Re-wrapping must not push the existing storage through the
            # numpy conversion again: a bitarray has no ``astype``.
            self._has_bitarray = mask._has_bitarray
            self._data = mask._data
            return

        # Only the import is guarded; ModuleNotFoundError is a subclass of
        # ImportError, so naming ImportError alone is sufficient.
        try:
            import bitarray  # noqa: F401
        except ImportError:
            self._has_bitarray = False
            self._data = mask.astype(bool, copy=False)
        else:
            self._has_bitarray = True
            self._data = self._numpy_to_bitarray(mask)

    @classmethod
    def _from_storage(cls, data, has_bitarray):
        """Build an instance around existing storage without converting it."""
        new = cls.__new__(cls)
        new._has_bitarray = has_bitarray
        new._data = data
        return new

    def _require_numpy(self, operation):
        """Raise NotImplementedError if the mask is bitarray-backed."""
        if self._has_bitarray:
            raise NotImplementedError(
                "{} is not implemented for bitarray-backed masks".format(
                    operation))

    def _numpy_to_bitarray(self, arr):
        """Pack a numpy array into a new bitarray, one bit per element."""
        # Import the class itself; the package name refers to the module,
        # which is not callable.
        from bitarray import bitarray

        bit_arr = bitarray()
        # ``pack`` maps each zero byte to bit 0 and any other byte to bit 1.
        bit_arr.pack(arr.astype(bool, copy=False).tobytes())
        return bit_arr

    def _bitarray_to_numpy(self, arr):
        """Unpack a bitarray into a writable boolean numpy array."""
        # Spell out the byte mapping so every bit becomes exactly 0 or 1.
        raw = arr.unpack(zero=b'\x00', one=b'\x01')
        # ``frombuffer`` on bytes is read-only; copy to get a writable array.
        return np.frombuffer(raw, dtype=bool).copy()

    def __getitem__(self, item):
        self._require_numpy("__getitem__")
        return self._data[item]

    def __setitem__(self, key, value):
        self._require_numpy("__setitem__")
        self._data[key] = value

    def __array__(self, dtype=None, copy=None):
        """Return the mask as a numpy array.

        ``dtype`` and ``copy`` are optional so that the no-argument call
        keeps returning the underlying array itself, while calls such as
        ``np.asarray(mask, dtype=...)`` are also accepted.
        """
        self._require_numpy("__array__")
        result = self._data
        if dtype is not None:
            result = result.astype(dtype, copy=False)
        if copy and result is self._data:
            result = result.copy()
        return result

    def __iter__(self):
        return iter(self._data)

    def __invert__(self):
        self._require_numpy("__invert__")
        return type(self)(~self._data)

    def __or__(self, other):
        """Return the element-wise OR with ``other`` as a numpy array."""
        self._require_numpy("__or__")
        if isinstance(other, NAMask):
            other._require_numpy("__or__")
            other = other._data
        return self._data | other

    def __ior__(self, other):
        # Not performed in place: ``mask |= other`` rebinds the name to the
        # freshly computed array and leaves this object's storage untouched.
        return self.__or__(other)

    @property
    def nbytes(self):
        """Number of bytes used by the underlying storage."""
        if self._has_bitarray:
            return self._data.buffer_info()[1]

        return self._data.nbytes

    @property
    def size(self):
        """Number of elements in the mask."""
        if self._has_bitarray:
            return len(self._data)

        return self._data.size

    def astype(self, dtype, copy=False):
        """Return the mask converted to ``dtype`` as a numpy array."""
        self._require_numpy("astype")
        return self._data.astype(dtype, copy=copy)

    def any(self):
        """Return whether at least one element is set."""
        return self._data.any()

    def copy(self):
        """Return a new mask backed by a copy of the storage."""
        return type(self)._from_storage(self._data.copy(),
                                        self._has_bitarray)

    def sum(self):
        """Return the number of set elements."""
        if self._has_bitarray:
            return self._data.count()

        return self._data.sum()


# ---------------------------------------------------------------------------
# The remaining hunks of the same diff touch pandas/core/arrays/integer.py.
# They show only fragments of their enclosing definitions and are kept below
# as received (line breaks and indentation reconstructed from the collapsed
# text).
#
# diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
# index af2c05bbee7c2..1daf96ab45489 100644
# --- a/pandas/core/arrays/integer.py
# +++ b/pandas/core/arrays/integer.py
# @@ -19,6 +19,7 @@
#  from pandas.core import nanops
#  from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
# +from pandas.core.arrays._mask import NAMask
#  from pandas.core.tools.numeric import to_numeric
# @@ -247,7 +248,8 @@ def __init__(self, values, mask, copy=False):
#                  and is_integer_dtype(values.dtype)):
#              raise TypeError("values should be integer numpy array. Use "
#                              "the 'integer_array' function instead")
# -        if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
# +        if not (isinstance(mask, NAMask) or (
# +                isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype))):
#              raise TypeError("mask should be boolean numpy array. Use "
#                              "the 'integer_array' function instead")
# @@ -256,7 +258,7 @@ def __init__(self, values, mask, copy=False):
#              mask = mask.copy()
#          self._data = values
# -        self._mask = mask
# +        self._mask = NAMask(mask)
#      @classmethod
#      def _from_sequence(cls, scalars, dtype=None, copy=False):
# @@ -283,6 +285,7 @@ def __getitem__(self, item):
#              if self._mask[item]:
#                  return self.dtype.na_value
#              return self._data[item]
# +
#          return type(self)(self._data[item], self._mask[item])
#      def _coerce_to_ndarray(self):
#
# NOTE(review): the type check above admits a NAMask, and the visible
# assignment then wraps ``mask`` in NAMask again -- confirm against the full
# file that no unwrapping happens in between.