diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5ce5ae7186774..d12114bd951ba 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2206,48 +2206,71 @@ def asi8(self): return self.values.view('i8') -class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): +class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () - is_timedelta = True + is_datetime = True _can_hold_na = True - is_numeric = False def __init__(self, values, placement, ndim=None): - if values.dtype != _TD_DTYPE: - values = conversion.ensure_timedelta64ns(values) - if isinstance(values, TimedeltaArray): + values = self._maybe_coerce_values(values) + super(DatetimeBlock, self).__init__(values, + placement=placement, ndim=ndim) + + def _maybe_coerce_values(self, values): + """Input validation for values passed to __init__. Ensure that + we have datetime64ns, coercing if necessary. + + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + + Returns + ------- + values : ndarray[datetime64ns] + + Overridden by DatetimeTZBlock. + """ + if values.dtype != _NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + + if isinstance(values, DatetimeArray): values = values._data + assert isinstance(values, np.ndarray), type(values) - super(TimeDeltaBlock, self).__init__(values, - placement=placement, ndim=ndim) + return values - @property - def _holder(self): - return TimedeltaArray + def _astype(self, dtype, **kwargs): + """ + these automatically copy, so copy=True has no effect + raise on an except if raise == True + """ + dtype = pandas_dtype(dtype) - @property - def _box_func(self): - return lambda x: Timedelta(x, unit='ns') + # if we are passed a datetime64[ns, tz] + if is_datetime64tz_dtype(dtype): + values = self.values + if getattr(values, 'tz', None) is None: + values = DatetimeIndex(values).tz_localize('UTC') + values = values.tz_convert(dtype.tz) + return self.make_block(values) + + # delegate + return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs) def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return issubclass(tipo.type, (np.timedelta64, np.int64)) - return is_integer(element) or isinstance( - element, (timedelta, np.timedelta64, np.int64)) - - def fillna(self, value, **kwargs): - - # allow filling with integers to be - # interpreted as seconds - if is_integer(value) and not isinstance(value, np.timedelta64): - value = Timedelta(value, unit='s') - return super(TimeDeltaBlock, self).fillna(value, **kwargs) + return tipo == _NS_DTYPE or tipo == np.int64 + return (is_integer(element) or isinstance(element, datetime) or + isna(element)) def _try_coerce_args(self, values, other): """ - Coerce values and other to int64, with null values converted to - iNaT. values is always ndarray-like, other may not be + Coerce values and other to dtype 'i8'. NaN and NaT convert to + the smallest i8, and will correctly round-trip to NaT if converted + back in _try_coerce_result. values is always ndarray-like, other + may not be Parameters ---------- @@ -2258,19 +2281,20 @@ def _try_coerce_args(self, values, other): ------- base-type values, base-type other """ + values = values.view('i8') if isinstance(other, bool): raise TypeError elif is_null_datelike_scalar(other): other = tslibs.iNaT - elif isinstance(other, Timedelta): - other = other.value - elif isinstance(other, timedelta): - other = Timedelta(other).value - elif isinstance(other, np.timedelta64): - other = Timedelta(other).value - elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): + elif isinstance(other, (datetime, np.datetime64, date)): + other = self._box_func(other) + if getattr(other, 'tz') is not None: + raise TypeError("cannot coerce a Timestamp with a tz on a " + "naive Block") + other = other.asm8.view('i8') + elif hasattr(other, 'dtype') and is_datetime64_dtype(other): other = other.astype('i8', copy=False).view('i8') else: # coercion issues @@ -2280,549 +2304,345 @@ def _try_coerce_args(self, values, other): return values, other def _try_coerce_result(self, result): - """ reverse of try_coerce_args / try_operate """ + """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): - mask = isna(result) if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('m8[ns]') - result[mask] = tslibs.iNaT - elif isinstance(result, (np.integer, np.float)): + try: + result = result.astype('M8[ns]') + except ValueError: + pass + elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) return result - def should_store(self, value): - return (issubclass(value.dtype.type, np.timedelta64) and - not is_extension_array_dtype(value)) + @property + def _box_func(self): + return tslibs.Timestamp - def to_native_types(self, slicer=None, na_rep=None, quoting=None, - **kwargs): + def to_native_types(self, slicer=None, na_rep=None, date_format=None, + quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values - if slicer is not None: - values = values[:, slicer] - mask = isna(values) + i8values = self.values.view('i8') - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = 'NaT' - rvalues[mask] = na_rep - imask = (~mask).ravel() + if slicer is not None: + i8values = i8values[..., slicer] - # FIXME: - # should use the formats.format.Timedelta64Formatter here - # to figure what format to pass to the Timedelta - # e.g. to not show the decimals say - rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') - for val in values.ravel()[imask]], - dtype=object) - return rvalues + from pandas.io.formats.format import _get_format_datetime64_from_values + format = _get_format_datetime64_from_values(values, date_format) - def external_values(self, dtype=None): - return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) + result = tslib.format_array_from_datetime( + i8values.ravel(), tz=getattr(self.values, 'tz', None), + format=format, na_rep=na_rep).reshape(i8values.shape) + return np.atleast_2d(result) + def should_store(self, value): + return (issubclass(value.dtype.type, np.datetime64) and + not is_datetime64tz_dtype(value) and + not is_extension_array_dtype(value)) -class BoolBlock(NumericBlock): - __slots__ = () - is_bool = True - _can_hold_na = False + def set(self, locs, values): + """ + Modify Block in-place with new item value - def _can_hold_element(self, element): - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, np.bool_) - return isinstance(element, (bool, np.bool_)) + Returns + ------- + None + """ + values = conversion.ensure_datetime64ns(values, copy=False) - def should_store(self, value): - return (issubclass(value.dtype.type, np.bool_) and not - is_extension_array_dtype(value)) + self.values[locs] = values - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): - inplace = validate_bool_kwarg(inplace, 'inplace') - to_replace_values = np.atleast_1d(to_replace) - if not np.can_cast(to_replace_values, bool): - return self - return super(BoolBlock, self).replace(to_replace, value, - inplace=inplace, filter=filter, - regex=regex, convert=convert) + def external_values(self): + return np.asarray(self.values.astype('datetime64[ns]', copy=False)) -class ObjectBlock(Block): +class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): + """ implement a datetime64 block with a tz attribute """ __slots__ = () - is_object = True - _can_hold_na = True - - def __init__(self, values, placement=None, ndim=2): - if issubclass(values.dtype.type, compat.string_types): - values = np.array(values, dtype=object) + is_datetimetz = True + is_extension = True - super(ObjectBlock, self).__init__(values, ndim=ndim, - placement=placement) + def __init__(self, values, placement, ndim=2, dtype=None): + # XXX: This will end up calling _maybe_coerce_values twice + # when dtype is not None. It's relatively cheap (just an isinstance) + # but it'd nice to avoid. + # + # If we can remove dtype from __init__, and push that conversion + # push onto the callers, then we can remove this entire __init__ + # and just use DatetimeBlock's. + if dtype is not None: + values = self._maybe_coerce_values(values, dtype=dtype) + super(DatetimeTZBlock, self).__init__(values, placement=placement, + ndim=ndim) @property - def is_bool(self): - """ we can be a bool if we have only bool values but are of type - object - """ - return lib.is_bool_array(self.values.ravel()) + def _holder(self): + return DatetimeArray - # TODO: Refactor when convert_objects is removed since there will be 1 path - def convert(self, *args, **kwargs): - """ attempt to coerce any object types to better types return a copy of - the block (if copy = True) by definition we ARE an ObjectBlock!!!!! + def _maybe_coerce_values(self, values, dtype=None): + """Input validation for values passed to __init__. Ensure that + we have datetime64TZ, coercing if necessary. - can return multiple blocks! + Parametetrs + ----------- + values : array-like + Must be convertible to datetime64 + dtype : string or DatetimeTZDtype, optional + Does a shallow copy to this tz + + Returns + ------- + values : ndarray[datetime64ns] """ + if not isinstance(values, self._holder): + values = self._holder(values) - if args: - raise NotImplementedError - by_item = True if 'by_item' not in kwargs else kwargs['by_item'] + if dtype is not None: + if isinstance(dtype, compat.string_types): + dtype = DatetimeTZDtype.construct_from_string(dtype) + values = type(values)(values, dtype=dtype) - new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta'] - new_style = False - for kw in new_inputs: - new_style |= kw in kwargs + if values.tz is None: + raise ValueError("cannot create a DatetimeTZBlock without a tz") - if new_style: - fn = soft_convert_objects - fn_inputs = new_inputs - else: - fn = maybe_convert_objects - fn_inputs = ['convert_dates', 'convert_numeric', - 'convert_timedeltas'] - fn_inputs += ['copy'] + return values - fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} + @property + def is_view(self): + """ return a boolean if I am possibly a view """ + # check the ndarray values of the DatetimeIndex values + return self.values._data.base is not None - # operate column-by-column - def f(m, v, i): - shape = v.shape - values = fn(v.ravel(), **fn_kwargs) - try: - values = values.reshape(shape) - values = _block_shape(values, ndim=self.ndim) - except (AttributeError, NotImplementedError): - pass - - return values - - if by_item and not self._is_single_block: - blocks = self.split_and_operate(None, f, False) - else: - values = f(None, self.values.ravel(), None) - blocks = [make_block(values, ndim=self.ndim, - placement=self.mgr_locs)] - - return blocks - - def set(self, locs, values): - """ - Modify Block in-place with new item value - - Returns - ------- - None - """ - try: - self.values[locs] = values - except (ValueError): - - # broadcasting error - # see GH6171 - new_shape = list(values.shape) - new_shape[0] = len(self.items) - self.values = np.empty(tuple(new_shape), dtype=self.dtype) - self.values.fill(np.nan) - self.values[locs] = values - - def _maybe_downcast(self, blocks, downcast=None): - - if downcast is not None: - return blocks - - # split and convert the blocks - return _extend_blocks([b.convert(datetime=True, numeric=False) - for b in blocks]) - - def _can_hold_element(self, element): - return True - - def _try_coerce_args(self, values, other): - """ provide coercion to our input arguments """ - - if isinstance(other, ABCDatetimeIndex): - # May get a DatetimeIndex here. Unbox it. - other = other.array - - if isinstance(other, DatetimeArray): - # hit in pandas/tests/indexing/test_coercion.py - # ::TestWhereCoercion::test_where_series_datetime64[datetime64tz] - # when falling back to ObjectBlock.where - other = other.astype(object) - - return values, other - - def should_store(self, value): - return not (issubclass(value.dtype.type, - (np.integer, np.floating, np.complexfloating, - np.datetime64, np.bool_)) or - # TODO(ExtensionArray): remove is_extension_type - # when all extension arrays have been ported. - is_extension_type(value) or - is_extension_array_dtype(value)) - - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): - to_rep_is_list = is_list_like(to_replace) - value_is_list = is_list_like(value) - both_lists = to_rep_is_list and value_is_list - either_list = to_rep_is_list or value_is_list - - result_blocks = [] - blocks = [self] - - if not either_list and is_re(to_replace): - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, regex=True, - convert=convert) - elif not (either_list or regex): - return super(ObjectBlock, self).replace(to_replace, value, - inplace=inplace, - filter=filter, regex=regex, - convert=convert) - elif both_lists: - for to_rep, v in zip(to_replace, value): - result_blocks = [] - for b in blocks: - result = b._replace_single(to_rep, v, inplace=inplace, - filter=filter, regex=regex, - convert=convert) - result_blocks = _extend_blocks(result, result_blocks) - blocks = result_blocks - return result_blocks - - elif to_rep_is_list and regex: - for to_rep in to_replace: - result_blocks = [] - for b in blocks: - result = b._replace_single(to_rep, value, inplace=inplace, - filter=filter, regex=regex, - convert=convert) - result_blocks = _extend_blocks(result, result_blocks) - blocks = result_blocks - return result_blocks - - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, convert=convert, - regex=regex) + def copy(self, deep=True): + """ copy constructor """ + values = self.values + if deep: + values = values.copy(deep=True) + return self.make_block_same_class(values) - def _replace_single(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mask=None): + def get_values(self, dtype=None): """ - Replace elements by the given value. + Returns an ndarray of values. Parameters ---------- - to_replace : object or pattern - Scalar to replace or regular expression to match. - value : object - Replacement object. - inplace : bool, default False - Perform inplace modification. - filter : list, optional - regex : bool, default False - If true, perform regular expression substitution. - convert : bool, default True - If true, try to coerce any object types to better types. - mask : array-like of bool, optional - True indicate corresponding element is ignored. + dtype : np.dtype + Only `object`-like dtypes are respected here (not sure + why). Returns ------- - a new block, the result after replacing - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - - # to_replace is regex compilable - to_rep_re = regex and is_re_compilable(to_replace) - - # regex is regex compilable - regex_re = is_re_compilable(regex) - - # only one will survive - if to_rep_re and regex_re: - raise AssertionError('only one of to_replace and regex can be ' - 'regex compilable') - - # if regex was passed as something that can be a regex (rather than a - # boolean) - if regex_re: - to_replace = regex - - regex = regex_re or to_rep_re - - # try to get the pattern attribute (compiled re) or it's a string - try: - pattern = to_replace.pattern - except AttributeError: - pattern = to_replace - - # if the pattern is not empty and to_replace is either a string or a - # regex - if regex and pattern: - rx = re.compile(to_replace) - else: - # if the thing to replace is not a string or compiled regex call - # the superclass method -> to_replace is some kind of object - return super(ObjectBlock, self).replace(to_replace, value, - inplace=inplace, - filter=filter, regex=regex) - - new_values = self.values if inplace else self.values.copy() - - # deal with replacing values with objects (strings) that match but - # whose replacement is not a string (numeric, nan, object) - if isna(value) or not isinstance(value, compat.string_types): - - def re_replacer(s): - try: - return value if rx.search(s) is not None else s - except TypeError: - return s - else: - # value is guaranteed to be a string here, s can be either a string - # or null if it's null it gets returned - def re_replacer(s): - try: - return rx.sub(value, s) - except TypeError: - return s + values : ndarray + When ``dtype=object``, then and object-dtype ndarray of + boxed values is returned. Otherwise, an M8[ns] ndarray + is returned. - f = np.vectorize(re_replacer, otypes=[self.dtype]) + DatetimeArray is always 1-d. ``get_values`` will reshape + the return value to be the same dimensionality as the + block. + """ + values = self.values + if is_object_dtype(dtype): + values = values._box_values(values._data) - if filter is None: - filt = slice(None) - else: - filt = self.mgr_locs.isin(filter).nonzero()[0] + values = np.asarray(values) - if mask is None: - new_values[filt] = f(new_values[filt]) - else: - new_values[filt][mask] = f(new_values[filt][mask]) + if self.ndim == 2: + # Ensure that our shape is correct for DataFrame. + # ExtensionArrays are always 1-D, even in a DataFrame when + # the analogous NumPy-backed column would be a 2-D ndarray. + values = values.reshape(1, -1) + return values - # convert - block = self.make_block(new_values) - if convert: - block = block.convert(by_item=True, numeric=False) - return block + def _slice(self, slicer): + """ return a slice of my values """ + if isinstance(slicer, tuple): + col, loc = slicer + if not com.is_null_slice(col) and col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values[loc] + return self.values[slicer] - def _replace_coerce(self, to_replace, value, inplace=True, regex=False, - convert=False, mask=None): + def _try_coerce_args(self, values, other): """ - Replace value corresponding to the given boolean array with another - value. + localize and return i8 for the values Parameters ---------- - to_replace : object or pattern - Scalar to replace or regular expression to match. - value : object - Replacement object. - inplace : bool, default False - Perform inplace modification. - regex : bool, default False - If true, perform regular expression substitution. - convert : bool, default True - If true, try to coerce any object types to better types. - mask : array-like of bool, optional - True indicate corresponding element is ignored. + values : ndarray-like + other : ndarray-like or scalar Returns ------- - A new block if there is anything to replace or the original block. + base-type values, base-type other """ - if mask.any(): - block = super(ObjectBlock, self)._replace_coerce( - to_replace=to_replace, value=value, inplace=inplace, - regex=regex, convert=convert, mask=mask) - if convert: - block = [b.convert(by_item=True, numeric=False, copy=True) - for b in block] - return block - return self - + # asi8 is a view, needs copy + values = _block_shape(values.view("i8"), ndim=self.ndim) -class CategoricalBlock(ExtensionBlock): - __slots__ = () - is_categorical = True - _verify_integrity = True - _can_hold_na = True - _concatenator = staticmethod(_concat._concat_categorical) + if isinstance(other, ABCSeries): + other = self._holder(other) - def __init__(self, values, placement, ndim=None): - from pandas.core.arrays.categorical import _maybe_to_categorical + if isinstance(other, bool): + raise TypeError + elif is_datetime64_dtype(other): + # add the tz back + other = self._holder(other, dtype=self.dtype) - # coerce to categorical if we can - super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), - placement=placement, - ndim=ndim) + elif (is_null_datelike_scalar(other) or + (lib.is_scalar(other) and isna(other))): + other = tslibs.iNaT + elif isinstance(other, self._holder): + if other.tz != self.values.tz: + raise ValueError("incompatible or non tz-aware value") + other = _block_shape(other.asi8, ndim=self.ndim) + elif isinstance(other, (np.datetime64, datetime, date)): + other = tslibs.Timestamp(other) + tz = getattr(other, 'tz', None) - @property - def _holder(self): - return Categorical + # test we can have an equal time zone + if tz is None or str(tz) != str(self.values.tz): + raise ValueError("incompatible or non tz-aware value") + other = other.value + else: + raise TypeError - @property - def array_dtype(self): - """ the dtype to return if I want to construct this block as an - array - """ - return np.object_ + return values, other def _try_coerce_result(self, result): """ reverse of try_coerce_args """ + if isinstance(result, np.ndarray): + if result.dtype.kind in ['i', 'f', 'O']: + result = result.astype('M8[ns]') + elif isinstance(result, (np.integer, np.float, np.datetime64)): + result = self._box_func(result) + if isinstance(result, np.ndarray): + # allow passing of > 1dim if its trivial - # GH12564: CategoricalBlock is 1-dim only - # while returned results could be any dim - if ((not is_categorical_dtype(result)) and - isinstance(result, np.ndarray)): - result = _block_shape(result, ndim=self.ndim) + if result.ndim > 1: + result = result.reshape(np.prod(result.shape)) + # GH#24096 new values invalidates a frequency + result = self._holder._simple_new(result, freq=None, + tz=self.values.tz) return result - def to_dense(self): - # Categorical.get_values returns a DatetimeIndex for datetime - # categories, so we can't simply use `np.asarray(self.values)` like - # other types. - return self.values.get_values() + @property + def _box_func(self): + return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ + def diff(self, n, axis=0): + """1st discrete difference - values = self.values - if slicer is not None: - # Categorical is always one dimension - values = values[slicer] - mask = isna(values) - values = np.array(values, dtype='object') - values[mask] = na_rep + Parameters + ---------- + n : int, number of periods to diff + axis : int, axis to diff upon. default 0 - # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) + Return + ------ + A list with a new TimeDeltaBlock. - def concat_same_type(self, to_concat, placement=None): + Note + ---- + The arguments here are mimicking shift so they are called correctly + by apply. """ - Concatenate list of single blocks of the same type. + if axis == 0: + # Cannot currently calculate diff across multiple blocks since this + # function is invoked via apply + raise NotImplementedError + new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 - Note that this CategoricalBlock._concat_same_type *may* not - return a CategoricalBlock. When the categories in `to_concat` - differ, this will return an object ndarray. + # Reshape the new_values like how algos.diff does for timedelta data + new_values = new_values.reshape(1, len(new_values)) + new_values = new_values.astype('timedelta64[ns]') + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] - If / when we decide we don't like that behavior: + def concat_same_type(self, to_concat, placement=None): + # need to handle concat([tz1, tz2]) here, since DatetimeArray + # only handles cases where all the tzs are the same. + # Instead of placing the condition here, it could also go into the + # is_uniform_join_units check, but I'm not sure what is better. + if len({x.dtype for x in to_concat}) > 1: + values = _concat._concat_datetime([x.values for x in to_concat]) + placement = placement or slice(0, len(values), 1) - 1. Change Categorical._concat_same_type to use union_categoricals - 2. Delete this method. - """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) - # not using self.make_block_same_class as values can be object dtype - return make_block( - values, placement=placement or slice(0, len(values), 1), - ndim=self.ndim) + if self.ndim > 1: + values = np.atleast_2d(values) + return ObjectBlock(values, ndim=self.ndim, placement=placement) + return super(DatetimeTZBlock, self).concat_same_type(to_concat, + placement) - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False): - # TODO(CategoricalBlock.where): - # This can all be deleted in favor of ExtensionBlock.where once - # we enforce the deprecation. - object_msg = ( - "Implicitly converting categorical to object-dtype ndarray. " - "One or more of the values in 'other' are not present in this " - "categorical's categories. A future version of pandas will raise " - "a ValueError when 'other' contains different categories.\n\n" - "To preserve the current behavior, add the new categories to " - "the categorical before calling 'where', or convert the " - "categorical to a different dtype." - ) + def fillna(self, value, limit=None, inplace=False, downcast=None): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. try: - # Attempt to do preserve categorical dtype. - result = super(CategoricalBlock, self).where( - other, cond, align, errors, try_cast, axis, transpose + return super(DatetimeTZBlock, self).fillna( + value, limit, inplace, downcast + ) + except (ValueError, TypeError): + # different timezones, or a non-tz + return self.astype(object).fillna( + value, limit=limit, inplace=inplace, downcast=downcast ) - except (TypeError, ValueError): - warnings.warn(object_msg, FutureWarning, stacklevel=6) - result = self.astype(object).where(other, cond, align=align, - errors=errors, - try_cast=try_cast, - axis=axis, transpose=transpose) - return result + def setitem(self, indexer, value): + # https://github.com/pandas-dev/pandas/issues/24020 + # Need a dedicated setitem until #24020 (type promotion in setitem + # for extension arrays) is designed and implemented. + try: + return super(DatetimeTZBlock, self).setitem(indexer, value) + except (ValueError, TypeError): + newb = make_block(self.values.astype(object), + placement=self.mgr_locs, + klass=ObjectBlock,) + return newb.setitem(indexer, value) -class DatetimeBlock(DatetimeLikeBlockMixin, Block): + +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () - is_datetime = True + is_timedelta = True _can_hold_na = True + is_numeric = False def __init__(self, values, placement, ndim=None): - values = self._maybe_coerce_values(values) - super(DatetimeBlock, self).__init__(values, - placement=placement, ndim=ndim) - - def _maybe_coerce_values(self, values): - """Input validation for values passed to __init__. Ensure that - we have datetime64ns, coercing if necessary. - - Parameters - ---------- - values : array-like - Must be convertible to datetime64 - - Returns - ------- - values : ndarray[datetime64ns] - - Overridden by DatetimeTZBlock. - """ - if values.dtype != _NS_DTYPE: - values = conversion.ensure_datetime64ns(values) - - if isinstance(values, DatetimeArray): + if values.dtype != _TD_DTYPE: + values = conversion.ensure_timedelta64ns(values) + if isinstance(values, TimedeltaArray): values = values._data - assert isinstance(values, np.ndarray), type(values) - return values - - def _astype(self, dtype, **kwargs): - """ - these automatically copy, so copy=True has no effect - raise on an except if raise == True - """ - dtype = pandas_dtype(dtype) + super(TimeDeltaBlock, self).__init__(values, + placement=placement, ndim=ndim) - # if we are passed a datetime64[ns, tz] - if is_datetime64tz_dtype(dtype): - values = self.values - if getattr(values, 'tz', None) is None: - values = DatetimeIndex(values).tz_localize('UTC') - values = values.tz_convert(dtype.tz) - return self.make_block(values) + @property + def _holder(self): + return TimedeltaArray - # delegate - return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs) + @property + def _box_func(self): + return lambda x: Timedelta(x, unit='ns') def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return tipo == _NS_DTYPE or tipo == np.int64 - return (is_integer(element) or isinstance(element, datetime) or - isna(element)) + return issubclass(tipo.type, (np.timedelta64, np.int64)) + return is_integer(element) or isinstance( + element, (timedelta, np.timedelta64, np.int64)) + + def fillna(self, value, **kwargs): + + # allow filling with integers to be + # interpreted as seconds + if is_integer(value) and not isinstance(value, np.timedelta64): + value = Timedelta(value, unit='s') + return super(TimeDeltaBlock, self).fillna(value, **kwargs) def _try_coerce_args(self, values, other): """ - Coerce values and other to dtype 'i8'. NaN and NaT convert to - the smallest i8, and will correctly round-trip to NaT if converted - back in _try_coerce_result. values is always ndarray-like, other - may not be + Coerce values and other to int64, with null values converted to + iNaT. values is always ndarray-like, other may not be Parameters ---------- @@ -2833,20 +2653,19 @@ def _try_coerce_args(self, values, other): ------- base-type values, base-type other """ - values = values.view('i8') if isinstance(other, bool): raise TypeError elif is_null_datelike_scalar(other): other = tslibs.iNaT - elif isinstance(other, (datetime, np.datetime64, date)): - other = self._box_func(other) - if getattr(other, 'tz') is not None: - raise TypeError("cannot coerce a Timestamp with a tz on a " - "naive Block") - other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and is_datetime64_dtype(other): + elif isinstance(other, Timedelta): + other = other.value + elif isinstance(other, timedelta): + other = Timedelta(other).value + elif isinstance(other, np.timedelta64): + other = Timedelta(other).value + elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): other = other.astype('i8', copy=False).view('i8') else: # coercion issues @@ -2856,43 +2675,141 @@ def _try_coerce_args(self, values, other): return values, other def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ + """ reverse of try_coerce_args / try_operate """ if isinstance(result, np.ndarray): + mask = isna(result) if result.dtype.kind in ['i', 'f', 'O']: - try: - result = result.astype('M8[ns]') - except ValueError: - pass - elif isinstance(result, (np.integer, np.float, np.datetime64)): + result = result.astype('m8[ns]') + result[mask] = tslibs.iNaT + elif isinstance(result, (np.integer, np.float)): result = self._box_func(result) return result + def should_store(self, value): + return (issubclass(value.dtype.type, np.timedelta64) and + not is_extension_array_dtype(value)) + + def to_native_types(self, slicer=None, na_rep=None, quoting=None, + **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + mask = isna(values) + + rvalues = np.empty(values.shape, dtype=object) + if na_rep is None: + na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (~mask).ravel() + + # FIXME: + # should use the formats.format.Timedelta64Formatter here + # to figure what format to pass to the Timedelta + # e.g. to not show the decimals say + rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') + for val in values.ravel()[imask]], + dtype=object) + return rvalues + + def external_values(self, dtype=None): + return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) + + +class BoolBlock(NumericBlock): + __slots__ = () + is_bool = True + _can_hold_na = False + + def _can_hold_element(self, element): + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, np.bool_) + return isinstance(element, (bool, np.bool_)) + + def should_store(self, value): + return (issubclass(value.dtype.type, np.bool_) and not + is_extension_array_dtype(value)) + + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True): + inplace = validate_bool_kwarg(inplace, 'inplace') + to_replace_values = np.atleast_1d(to_replace) + if not np.can_cast(to_replace_values, bool): + return self + return super(BoolBlock, self).replace(to_replace, value, + inplace=inplace, filter=filter, + regex=regex, convert=convert) + + +class ObjectBlock(Block): + __slots__ = () + is_object = True + _can_hold_na = True + + def __init__(self, values, placement=None, ndim=2): + if issubclass(values.dtype.type, compat.string_types): + values = np.array(values, dtype=object) + + super(ObjectBlock, self).__init__(values, ndim=ndim, + placement=placement) + @property - def _box_func(self): - return tslibs.Timestamp + def is_bool(self): + """ we can be a bool if we have only bool values but are of type + object + """ + return lib.is_bool_array(self.values.ravel()) + + # TODO: Refactor when convert_objects is removed since there will be 1 path + def convert(self, *args, **kwargs): + """ attempt to coerce any object types to better types return a copy of + the block (if copy = True) by definition we ARE an ObjectBlock!!!!! + + can return multiple blocks! + """ + + if args: + raise NotImplementedError + by_item = True if 'by_item' not in kwargs else kwargs['by_item'] - def to_native_types(self, slicer=None, na_rep=None, date_format=None, - quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ + new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta'] + new_style = False + for kw in new_inputs: + new_style |= kw in kwargs - values = self.values - i8values = self.values.view('i8') + if new_style: + fn = soft_convert_objects + fn_inputs = new_inputs + else: + fn = maybe_convert_objects + fn_inputs = ['convert_dates', 'convert_numeric', + 'convert_timedeltas'] + fn_inputs += ['copy'] - if slicer is not None: - i8values = i8values[..., slicer] + fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} - from pandas.io.formats.format import _get_format_datetime64_from_values - format = _get_format_datetime64_from_values(values, date_format) + # operate column-by-column + def f(m, v, i): + shape = v.shape + values = fn(v.ravel(), **fn_kwargs) + try: + values = values.reshape(shape) + values = _block_shape(values, ndim=self.ndim) + except (AttributeError, NotImplementedError): + pass - result = tslib.format_array_from_datetime( - i8values.ravel(), tz=getattr(self.values, 'tz', None), - format=format, na_rep=na_rep).reshape(i8values.shape) - return np.atleast_2d(result) + return values - def should_store(self, value): - return (issubclass(value.dtype.type, np.datetime64) and - not is_datetime64tz_dtype(value) and - not is_extension_array_dtype(value)) + if by_item and not self._is_single_block: + blocks = self.split_and_operate(None, f, False) + else: + values = f(None, self.values.ravel(), None) + blocks = [make_block(values, ndim=self.ndim, + placement=self.mgr_locs)] + + return blocks def set(self, locs, values): """ @@ -2902,255 +2819,338 @@ def set(self, locs, values): ------- None """ - values = conversion.ensure_datetime64ns(values, copy=False) + try: + self.values[locs] = values + except (ValueError): - self.values[locs] = values + # broadcasting error + # see GH6171 + new_shape = list(values.shape) + new_shape[0] = len(self.items) + self.values = np.empty(tuple(new_shape), dtype=self.dtype) + self.values.fill(np.nan) + self.values[locs] = values - def external_values(self): - return np.asarray(self.values.astype('datetime64[ns]', copy=False)) + def _maybe_downcast(self, blocks, downcast=None): + if downcast is not None: + return blocks -class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): - """ implement a datetime64 block with a tz attribute """ - __slots__ = () - is_datetimetz = True - is_extension = True + # split and convert the blocks + return _extend_blocks([b.convert(datetime=True, numeric=False) + for b in blocks]) - def __init__(self, values, placement, ndim=2, dtype=None): - # XXX: This will end up calling _maybe_coerce_values twice - # when dtype is not None. It's relatively cheap (just an isinstance) - # but it'd nice to avoid. - # - # If we can remove dtype from __init__, and push that conversion - # push onto the callers, then we can remove this entire __init__ - # and just use DatetimeBlock's. - if dtype is not None: - values = self._maybe_coerce_values(values, dtype=dtype) - super(DatetimeTZBlock, self).__init__(values, placement=placement, - ndim=ndim) + def _can_hold_element(self, element): + return True - @property - def _holder(self): - return DatetimeArray + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments """ - def _maybe_coerce_values(self, values, dtype=None): - """Input validation for values passed to __init__. Ensure that - we have datetime64TZ, coercing if necessary. + if isinstance(other, ABCDatetimeIndex): + # May get a DatetimeIndex here. Unbox it. + other = other.array - Parametetrs - ----------- - values : array-like - Must be convertible to datetime64 - dtype : string or DatetimeTZDtype, optional - Does a shallow copy to this tz + if isinstance(other, DatetimeArray): + # hit in pandas/tests/indexing/test_coercion.py + # ::TestWhereCoercion::test_where_series_datetime64[datetime64tz] + # when falling back to ObjectBlock.where + other = other.astype(object) - Returns - ------- - values : ndarray[datetime64ns] - """ - if not isinstance(values, self._holder): - values = self._holder(values) + return values, other - if dtype is not None: - if isinstance(dtype, compat.string_types): - dtype = DatetimeTZDtype.construct_from_string(dtype) - values = type(values)(values, dtype=dtype) + def should_store(self, value): + return not (issubclass(value.dtype.type, + (np.integer, np.floating, np.complexfloating, + np.datetime64, np.bool_)) or + # TODO(ExtensionArray): remove is_extension_type + # when all extension arrays have been ported. + is_extension_type(value) or + is_extension_array_dtype(value)) - if values.tz is None: - raise ValueError("cannot create a DatetimeTZBlock without a tz") + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True): + to_rep_is_list = is_list_like(to_replace) + value_is_list = is_list_like(value) + both_lists = to_rep_is_list and value_is_list + either_list = to_rep_is_list or value_is_list - return values + result_blocks = [] + blocks = [self] - @property - def is_view(self): - """ return a boolean if I am possibly a view """ - # check the ndarray values of the DatetimeIndex values - return self.values._data.base is not None + if not either_list and is_re(to_replace): + return self._replace_single(to_replace, value, inplace=inplace, + filter=filter, regex=True, + convert=convert) + elif not (either_list or regex): + return super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex, + convert=convert) + elif both_lists: + for to_rep, v in zip(to_replace, value): + result_blocks = [] + for b in blocks: + result = b._replace_single(to_rep, v, inplace=inplace, + filter=filter, regex=regex, + convert=convert) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks - def copy(self, deep=True): - """ copy constructor """ - values = self.values - if deep: - values = values.copy(deep=True) - return self.make_block_same_class(values) + elif to_rep_is_list and regex: + for to_rep in to_replace: + result_blocks = [] + for b in blocks: + result = b._replace_single(to_rep, value, inplace=inplace, + filter=filter, regex=regex, + convert=convert) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks - def get_values(self, dtype=None): + return self._replace_single(to_replace, value, inplace=inplace, + filter=filter, convert=convert, + regex=regex) + + def _replace_single(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True, mask=None): """ - Returns an ndarray of values. + Replace elements by the given value. Parameters ---------- - dtype : np.dtype - Only `object`-like dtypes are respected here (not sure - why). + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + filter : list, optional + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. Returns ------- - values : ndarray - When ``dtype=object``, then and object-dtype ndarray of - boxed values is returned. Otherwise, an M8[ns] ndarray - is returned. + a new block, the result after replacing + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + + # to_replace is regex compilable + to_rep_re = regex and is_re_compilable(to_replace) + + # regex is regex compilable + regex_re = is_re_compilable(regex) + + # only one will survive + if to_rep_re and regex_re: + raise AssertionError('only one of to_replace and regex can be ' + 'regex compilable') + + # if regex was passed as something that can be a regex (rather than a + # boolean) + if regex_re: + to_replace = regex + + regex = regex_re or to_rep_re + + # try to get the pattern attribute (compiled re) or it's a string + try: + pattern = to_replace.pattern + except AttributeError: + pattern = to_replace + + # if the pattern is not empty and to_replace is either a string or a + # regex + if regex and pattern: + rx = re.compile(to_replace) + else: + # if the thing to replace is not a string or compiled regex call + # the superclass method -> to_replace is some kind of object + return super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex) + + new_values = self.values if inplace else self.values.copy() + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isna(value) or not isinstance(value, compat.string_types): + + def re_replacer(s): + try: + return value if rx.search(s) is not None else s + except TypeError: + return s + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + try: + return rx.sub(value, s) + except TypeError: + return s - DatetimeArray is always 1-d. ``get_values`` will reshape - the return value to be the same dimensionality as the - block. - """ - values = self.values - if is_object_dtype(dtype): - values = values._box_values(values._data) + f = np.vectorize(re_replacer, otypes=[self.dtype]) - values = np.asarray(values) + if filter is None: + filt = slice(None) + else: + filt = self.mgr_locs.isin(filter).nonzero()[0] - if self.ndim == 2: - # Ensure that our shape is correct for DataFrame. - # ExtensionArrays are always 1-D, even in a DataFrame when - # the analogous NumPy-backed column would be a 2-D ndarray. - values = values.reshape(1, -1) - return values + if mask is None: + new_values[filt] = f(new_values[filt]) + else: + new_values[filt][mask] = f(new_values[filt][mask]) - def _slice(self, slicer): - """ return a slice of my values """ - if isinstance(slicer, tuple): - col, loc = slicer - if not com.is_null_slice(col) and col != 0: - raise IndexError("{0} only contains one item".format(self)) - return self.values[loc] - return self.values[slicer] + # convert + block = self.make_block(new_values) + if convert: + block = block.convert(by_item=True, numeric=False) + return block - def _try_coerce_args(self, values, other): + def _replace_coerce(self, to_replace, value, inplace=True, regex=False, + convert=False, mask=None): """ - localize and return i8 for the values + Replace value corresponding to the given boolean array with another + value. Parameters ---------- - values : ndarray-like - other : ndarray-like or scalar + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. Returns ------- - base-type values, base-type other + A new block if there is anything to replace or the original block. """ - # asi8 is a view, needs copy - values = _block_shape(values.view("i8"), ndim=self.ndim) + if mask.any(): + block = super(ObjectBlock, self)._replace_coerce( + to_replace=to_replace, value=value, inplace=inplace, + regex=regex, convert=convert, mask=mask) + if convert: + block = [b.convert(by_item=True, numeric=False, copy=True) + for b in block] + return block + return self - if isinstance(other, ABCSeries): - other = self._holder(other) - if isinstance(other, bool): - raise TypeError - elif is_datetime64_dtype(other): - # add the tz back - other = self._holder(other, dtype=self.dtype) +class CategoricalBlock(ExtensionBlock): + __slots__ = () + is_categorical = True + _verify_integrity = True + _can_hold_na = True + _concatenator = staticmethod(_concat._concat_categorical) - elif (is_null_datelike_scalar(other) or - (lib.is_scalar(other) and isna(other))): - other = tslibs.iNaT - elif isinstance(other, self._holder): - if other.tz != self.values.tz: - raise ValueError("incompatible or non tz-aware value") - other = _block_shape(other.asi8, ndim=self.ndim) - elif isinstance(other, (np.datetime64, datetime, date)): - other = tslibs.Timestamp(other) - tz = getattr(other, 'tz', None) + def __init__(self, values, placement, ndim=None): + from pandas.core.arrays.categorical import _maybe_to_categorical - # test we can have an equal time zone - if tz is None or str(tz) != str(self.values.tz): - raise ValueError("incompatible or non tz-aware value") - other = other.value - else: - raise TypeError + # coerce to categorical if we can + super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), + placement=placement, + ndim=ndim) - return values, other + @property + def _holder(self): + return Categorical + + @property + def array_dtype(self): + """ the dtype to return if I want to construct this block as an + array + """ + return np.object_ def _try_coerce_result(self, result): """ reverse of try_coerce_args """ - if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('M8[ns]') - elif isinstance(result, (np.integer, np.float, np.datetime64)): - result = tslibs.Timestamp(result, tz=self.values.tz) - if isinstance(result, np.ndarray): - # allow passing of > 1dim if its trivial - if result.ndim > 1: - result = result.reshape(np.prod(result.shape)) - # GH#24096 new values invalidates a frequency - result = self._holder._simple_new(result, freq=None, - tz=self.values.tz) + # GH12564: CategoricalBlock is 1-dim only + # while returned results could be any dim + if ((not is_categorical_dtype(result)) and + isinstance(result, np.ndarray)): + result = _block_shape(result, ndim=self.ndim) return result - @property - def _box_func(self): - return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) + def to_dense(self): + # Categorical.get_values returns a DatetimeIndex for datetime + # categories, so we can't simply use `np.asarray(self.values)` like + # other types. + return self.values.get_values() - def diff(self, n, axis=0): - """1st discrete difference + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): + """ convert to our native types format, slicing if desired """ - Parameters - ---------- - n : int, number of periods to diff - axis : int, axis to diff upon. default 0 + values = self.values + if slicer is not None: + # Categorical is always one dimension + values = values[slicer] + mask = isna(values) + values = np.array(values, dtype='object') + values[mask] = na_rep - Return - ------ - A list with a new TimeDeltaBlock. + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) - Note - ---- - The arguments here are mimicking shift so they are called correctly - by apply. + def concat_same_type(self, to_concat, placement=None): """ - if axis == 0: - # Cannot currently calculate diff across multiple blocks since this - # function is invoked via apply - raise NotImplementedError - new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + Concatenate list of single blocks of the same type. - # Reshape the new_values like how algos.diff does for timedelta data - new_values = new_values.reshape(1, len(new_values)) - new_values = new_values.astype('timedelta64[ns]') - return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + Note that this CategoricalBlock._concat_same_type *may* not + return a CategoricalBlock. When the categories in `to_concat` + differ, this will return an object ndarray. - def concat_same_type(self, to_concat, placement=None): - # need to handle concat([tz1, tz2]) here, since DatetimeArray - # only handles cases where all the tzs are the same. - # Instead of placing the condition here, it could also go into the - # is_uniform_join_units check, but I'm not sure what is better. - if len({x.dtype for x in to_concat}) > 1: - values = _concat._concat_datetime([x.values for x in to_concat]) - placement = placement or slice(0, len(values), 1) + If / when we decide we don't like that behavior: - if self.ndim > 1: - values = np.atleast_2d(values) - return ObjectBlock(values, ndim=self.ndim, placement=placement) - return super(DatetimeTZBlock, self).concat_same_type(to_concat, - placement) + 1. Change Categorical._concat_same_type to use union_categoricals + 2. Delete this method. + """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + # not using self.make_block_same_class as values can be object dtype + return make_block( + values, placement=placement or slice(0, len(values), 1), + ndim=self.ndim) - def fillna(self, value, limit=None, inplace=False, downcast=None): - # We support filling a DatetimeTZ with a `value` whose timezone - # is different by coercing to object. + def where(self, other, cond, align=True, errors='raise', + try_cast=False, axis=0, transpose=False): + # TODO(CategoricalBlock.where): + # This can all be deleted in favor of ExtensionBlock.where once + # we enforce the deprecation. + object_msg = ( + "Implicitly converting categorical to object-dtype ndarray. " + "One or more of the values in 'other' are not present in this " + "categorical's categories. A future version of pandas will raise " + "a ValueError when 'other' contains different categories.\n\n" + "To preserve the current behavior, add the new categories to " + "the categorical before calling 'where', or convert the " + "categorical to a different dtype." + ) try: - return super(DatetimeTZBlock, self).fillna( - value, limit, inplace, downcast - ) - except (ValueError, TypeError): - # different timezones, or a non-tz - return self.astype(object).fillna( - value, limit=limit, inplace=inplace, downcast=downcast + # Attempt to do preserve categorical dtype. + result = super(CategoricalBlock, self).where( + other, cond, align, errors, try_cast, axis, transpose ) - - def setitem(self, indexer, value): - # https://github.com/pandas-dev/pandas/issues/24020 - # Need a dedicated setitem until #24020 (type promotion in setitem - # for extension arrays) is designed and implemented. - try: - return super(DatetimeTZBlock, self).setitem(indexer, value) - except (ValueError, TypeError): - newb = make_block(self.values.astype(object), - placement=self.mgr_locs, - klass=ObjectBlock,) - return newb.setitem(indexer, value) + except (TypeError, ValueError): + warnings.warn(object_msg, FutureWarning, stacklevel=6) + result = self.astype(object).where(other, cond, align=align, + errors=errors, + try_cast=try_cast, + axis=axis, transpose=transpose) + return result # -----------------------------------------------------------------