From 221b493e6fc4fed9ef4a51eaea341f9d9c936de5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Mar 2015 10:39:44 -0500 Subject: [PATCH 1/2] API: deprecate setting of .ordered directly (GH9347, GH9190) add set_ordered method for setting ordered default for Categorical is now to NOT order unless explicity specified whatsnew doc updates for categorical api changes add ability to specify keywords to astype for creation defaults fix issue with grouping with sort=True on an unordered Categorical update categorical.rst docs test unsortable when ordered=True v0.16.0.txt / release notes updates clean up check for ordering allow groupby to work on an unordered categorical --- doc/source/api.rst | 2 + doc/source/categorical.rst | 40 +++-- doc/source/release.rst | 1 + doc/source/whatsnew/v0.16.0.txt | 145 ++++++++++++++++ pandas/core/categorical.py | 125 ++++++++++---- pandas/core/generic.py | 5 +- pandas/core/groupby.py | 12 +- pandas/core/index.py | 4 +- pandas/core/internals.py | 16 +- pandas/core/panel.py | 4 +- pandas/core/reshape.py | 7 +- pandas/io/pytables.py | 2 +- pandas/tests/data/categorical_0_15_2.pickle | Bin 0 -> 392 bytes pandas/tests/test_categorical.py | 175 +++++++++++++++----- pandas/tests/test_groupby.py | 30 +++- pandas/tools/merge.py | 4 +- 16 files changed, 461 insertions(+), 111 deletions(-) create mode 100644 pandas/tests/data/categorical_0_15_2.pickle diff --git a/doc/source/api.rst b/doc/source/api.rst index 7fbb432f0be6b..58ea517d055a0 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -585,6 +585,8 @@ following usable methods and properties (all available as ``Series.cat.` for :func:`~pandas.cut`. By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. -This is the only possibility to specify differently ordered categories (or no order at all) at -creation time and the only reason to use :class:`pandas.Categorical` directly: .. 
ipython:: python @@ -103,6 +101,14 @@ creation time and the only reason to use :class:`pandas.Categorical` directly: df["B"] = raw_cat df +You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: + +.. ipython:: python + + s = Series(["a","b","c","a"]) + s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + s_cat + Categorical data has a specific ``category`` :ref:`dtype `: .. ipython:: python @@ -176,10 +182,9 @@ It's also possible to pass in the categories in a specific order: s.cat.ordered .. note:: - New categorical data is automatically ordered if the passed in values are sortable or a - `categories` argument is supplied. This is a difference to R's `factors`, which are unordered - unless explicitly told to be ordered (``ordered=TRUE``). You can of course overwrite that by - passing in an explicit ``ordered=False``. + + New categorical data are NOT automatically ordered. You must explicitly pass ``ordered=True`` to + indicate an ordered ``Categorical``. Renaming categories @@ -270,6 +275,10 @@ Sorting and Order .. _categorical.sort: +.. warning:: + + The default for construction has changed in v0.16.0 to ``ordered=False``, from the prior implicit ``ordered=True`` + If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a meaning and certain operations are possible. If the categorical is unordered, a `TypeError` is raised. @@ -281,18 +290,26 @@ raised. s.sort() except TypeError as e: print("TypeError: " + str(e)) - s = Series(["a","b","c","a"], dtype="category") # ordered per default! + s = Series(["a","b","c","a"]).astype('category', ordered=True) s.sort() s s.min(), s.max() +You can set categorical data to be ordered by using ``as_ordered()`` or unordered by using ``as_unordered()``. These will by +default return a *new* object. + +.. 
ipython:: python + + s.cat.as_ordered() + s.cat.as_unordered() + Sorting will use the order defined by categories, not any lexical order present on the data type. This is even true for strings and numeric data: .. ipython:: python s = Series([1,2,3,1], dtype="category") - s.cat.categories = [2,3,1] + s = s.cat.set_categories([2,3,1], ordered=True) s s.sort() s @@ -310,7 +327,7 @@ necessarily make the sort order the same as the categories order. .. ipython:: python s = Series([1,2,3,1], dtype="category") - s = s.cat.reorder_categories([2,3,1]) + s = s.cat.reorder_categories([2,3,1], ordered=True) s s.sort() s @@ -339,7 +356,7 @@ The ordering of the categorical is determined by the ``categories`` of that colu .. ipython:: python - dfs = DataFrame({'A' : Categorical(list('bbeebbaa'),categories=['e','a','b']), + dfs = DataFrame({'A' : Categorical(list('bbeebbaa'),categories=['e','a','b'],ordered=True), 'B' : [1,2,1,2,2,1,2,1] }) dfs.sort(['A','B']) @@ -664,9 +681,6 @@ The following differences to R's factor functions can be observed: * R's `levels` are named `categories` * R's `levels` are always of type string, while `categories` in pandas can be of any dtype. -* New categorical data is automatically ordered if the passed in values are sortable or a - `categories` argument is supplied. This is a difference to R's `factors`, which are unordered - unless explicitly told to be ordered (``ordered=TRUE``). * It's not possible to specify labels at creation time. Use ``s.cat.rename_categories(new_labels)`` afterwards. 
* In contrast to R's `factor` function, using categorical data as the sole input to create a diff --git a/doc/source/release.rst b/doc/source/release.rst index d3d9f031f4598..b60a47bb72daa 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -59,6 +59,7 @@ Highlights include: - ``Series.to_coo/from_coo`` methods to interact with ``scipy.sparse``, see :ref:`here ` - Backwards incompatible change to ``Timedelta`` to conform the ``.seconds`` attribute with ``datetime.timedelta``, see :ref:`here ` - Changes to the ``.loc`` slicing API to conform with the behavior of ``.ix`` see :ref:`here ` +- Changes to the default for ordering in the ``Categorical`` constructor, see :ref:`here ` See the :ref:`v0.16.0 Whatsnew ` overview or the issue tracker on GitHub for an extensive list of all API changes, enhancements and bugs that have been fixed in 0.16.0. diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 3f37b44f509ff..765324b3aceb4 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -13,6 +13,7 @@ users upgrade to this version. * ``Series.to_coo/from_coo`` methods to interact with ``scipy.sparse``, see :ref:`here ` * Backwards incompatible change to ``Timedelta`` to conform the ``.seconds`` attribute with ``datetime.timedelta``, see :ref:`here ` * Changes to the ``.loc`` slicing API to conform with the behavior of ``.ix`` see :ref:`here ` + * Changes to the default for ordering in the ``Categorical`` constructor, see :ref:`here ` - Check the :ref:`API Changes ` and :ref:`deprecations ` before updating @@ -366,6 +367,150 @@ API Changes - ``Series.describe`` for categorical data will now give counts and frequencies of 0, not ``NaN``, for unused categories (:issue:`9443`) +Categorical Changes +~~~~~~~~~~~~~~~~~~~ + +.. 
_whatsnew_0160.api_breaking.categorical: + +In prior versions, ``Categoricals`` that had an unspecified ordering (meaning no ``ordered`` keyword was passed) were defaulted as ``ordered`` Categoricals. Going forward, the ``ordered`` keyword in the ``Categorical`` constructor will default to ``False``. Ordering must now be explicit. + +Furthermore, previously you *could* change the ``ordered`` attribute of a Categorical by just setting the attribute, e.g. ``cat.ordered=True``; this is now deprecated and you should use ``cat.as_ordered()`` or ``cat.as_unordered()``. These will by default return a **new** object and not modify the existing object. (:issue:`9347`, :issue:`9190`) + +Previous Behavior + +.. code-block:: python + + In [3]: s = Series([0,1,2], dtype='category') + + In [4]: s + Out[4]: + 0 0 + 1 1 + 2 2 + dtype: category + Categories (3, int64): [0 < 1 < 2] + + In [5]: s.cat.ordered + Out[5]: True + + In [6]: s.cat.ordered = False + + In [7]: s + Out[7]: + 0 0 + 1 1 + 2 2 + dtype: category + Categories (3, int64): [0, 1, 2] + +New Behavior + +.. ipython:: python + + s = Series([0,1,2], dtype='category') + s + s.cat.ordered + s = s.cat.as_ordered() + s + s.cat.ordered + + # you can set in the constructor of the Categorical + s = Series(Categorical([0,1,2],ordered=True)) + s + s.cat.ordered + +For ease of creation of series of categorical data, we have added the ability to pass keywords when calling ``.astype()``. These are passed directly to the constructor. + +.. ipython:: python + + s = Series(["a","b","c","a"]).astype('category',ordered=True) + s + s = Series(["a","b","c","a"]).astype('category',categories=list('abcdef'),ordered=False) + s + +.. warning:: + + This simple API change may have surprising effects if a user is relying on the previous defaulted behavior implicitly. In particular, + sorting operations with a ``Categorical`` will now raise an error: + + .. 
code-block:: python + + In [1]: df = DataFrame({ 'A' : Series(list('aabc')).astype('category'), 'B' : np.arange(4) }) + + In [2]: df['A'].order() + TypeError: Categorical is not ordered for operation argsort + you can use .as_ordered() to change the Categorical to an ordered one + + The solution is to make 'A' orderable, e.g. ``df['A'] = df['A'].cat.as_ordered()`` + + +Indexing Changes +~~~~~~~~~~~~~~~~ + +.. _whatsnew_0160.api_breaking.indexing: + +The behavior of a small sub-set of edge cases for using ``.loc`` has changed (:issue:`8613`). Furthermore we have improved the content of the error messages that are raised: + +- slicing with ``.loc`` where the start and/or stop bound is not found in the index is now allowed; this previously would raise a ``KeyError``. This makes the behavior the same as ``.ix`` in this case. This change is only for slicing, not when indexing with a single label. + +.. ipython:: python + + df = DataFrame(np.random.randn(5,4), + columns=list('ABCD'), + index=date_range('20130101',periods=5)) + df + s = Series(range(5),[-2,-1,1,2,3]) + s + + Previous Behavior + + .. code-block:: python + + In [4]: df.loc['2013-01-02':'2013-01-10'] + KeyError: 'stop bound [2013-01-10] is not in the [index]' + + In [6]: s.loc[-10:3] + KeyError: 'start bound [-10] is not the [index]' + + New Behavior + + .. ipython:: python + + df.loc['2013-01-02':'2013-01-10'] + s.loc[-10:3] + +- allow slicing with float-like values on an integer index for ``.ix``. Previously this was only enabled for ``.loc``: + + Previous Behavior + + .. code-block:: python + + In [8]: s.ix[-1.0:2] + TypeError: the slice start value [-1.0] is not a proper indexer for this index type (Int64Index) + + New Behavior + + .. ipython:: python + + s.ix[-1.0:2] + +- provide a useful exception for indexing with an invalid type for that index when using ``.loc``. 
For example trying to use ``.loc`` on an index of type ``DatetimeIndex`` or ``PeriodIndex`` or ``TimedeltaIndex``, with an integer (or a float). + + Previous Behavior + + .. code-block:: python + + In [4]: df.loc[2:3] + KeyError: 'start bound [2] is not the [index]' + + New Behavior + + .. code-block:: python + + In [4]: df.loc[2:3] + TypeError: Cannot do slice indexing on with keys + + .. _whatsnew_0160.deprecations: Deprecations diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index f4246ed057c89..ee76527b89974 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -17,7 +17,7 @@ from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull, is_categorical_dtype, is_integer_dtype, is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds, - is_list_like, is_sequence, is_null_slice, + is_list_like, is_sequence, is_null_slice, is_bool, _ensure_platform_int, _ensure_object, _ensure_int64, _coerce_indexer_dtype, _values_from_object, take_1d) from pandas.util.terminal import get_terminal_size @@ -139,9 +139,9 @@ class Categorical(PandasObject): categories : Index-like (unique), optional The unique categories for this categorical. If not given, the categories are assumed to be the unique values of values. - ordered : boolean, optional + ordered : boolean, (default False) Whether or not this categorical is treated as a ordered categorical. If not given, - the resulting categorical will be ordered if values can be sorted. + the resulting categorical will not be ordered. name : str, optional Name for the Categorical variable. If name is None, will attempt to infer from values. @@ -184,7 +184,6 @@ class Categorical(PandasObject): dtype = CategoricalDtype() """The dtype (always "category")""" - ordered = None """Whether or not this Categorical is ordered. 
Only ordered `Categoricals` can be sorted (according to the order @@ -201,10 +200,9 @@ class Categorical(PandasObject): # For comparisons, so that numpy uses our implementation if the compare ops, which raise __array_priority__ = 1000 _typ = 'categorical' - ordered = False name = None - def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False, + def __init__(self, values, categories=None, ordered=False, name=None, fastpath=False, levels=None): if fastpath: @@ -212,7 +210,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa self._codes = _coerce_indexer_dtype(values, categories) self.name = name self.categories = categories - self.ordered = ordered + self._ordered = ordered return if name is None: @@ -237,8 +235,6 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa cat = values.values if categories is None: categories = cat.categories - if ordered is None: - ordered = cat.ordered values = values.__array__() elif isinstance(values, Index): @@ -264,10 +260,6 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa if categories is None: try: codes, categories = factorize(values, sort=True) - # If the underlying data structure was sortable, and the user doesn't want to - # "forget" this order, the categorical also is sorted/ordered - if ordered is None: - ordered = True except TypeError: codes, categories = factorize(values, sort=False) if ordered: @@ -300,12 +292,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa warn("None of the categories were found in values. 
Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - # if we got categories, we can assume that the order is intended - # if ordered is unspecified - if ordered is None: - ordered = True - - self.ordered = False if ordered is None else ordered + self.set_ordered(ordered, inplace=True) self.categories = categories self.name = name self._codes = _coerce_indexer_dtype(codes, categories) @@ -360,7 +347,7 @@ def from_codes(cls, codes, categories, ordered=False, name=None): An integer array, where each integer points to a category in categories or -1 for NaN categories : index-like The categories for the categorical. Items need to be unique. - ordered : boolean, optional + ordered : boolean, (default False) Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will be unordered. name : str, optional @@ -460,6 +447,61 @@ def _get_levels(self): # TODO: Remove after deprecation period in 2017/ after 0.18 levels = property(fget=_get_levels, fset=_set_levels) + _ordered = None + + def _set_ordered(self, value): + """ Sets the ordered attribute to the boolean value """ + warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", FutureWarning) + self.set_ordered(value, inplace=True) + + def set_ordered(self, value, inplace=False): + """ + Sets the ordered attribute to the boolean value + + Parameters + ---------- + value : boolean to set whether this categorical is ordered (True) or not (False) + inplace : boolean (default: False) + Whether or not to set the ordered attribute inplace or return a copy of this categorical + with ordered set to the value + """ + if not is_bool(value): + raise TypeError("ordered must be a boolean value") + cat = self if inplace else self.copy() + cat._ordered = value + if not inplace: + return cat + + def as_ordered(self, inplace=False): + """ + Sets the Categorical to be ordered + + Parameters + ---------- + inplace : boolean (default: False) + 
Whether or not to set the ordered attribute inplace or return a copy of this categorical + with ordered set to True + """ + return self.set_ordered(True, inplace=inplace) + + def as_unordered(self, inplace=False): + """ + Sets the Categorical to be unordered + + Parameters + ---------- + inplace : boolean (default: False) + Whether or not to set the ordered attribute inplace or return a copy of this categorical + with ordered set to False + """ + return self.set_ordered(False, inplace=inplace) + + def _get_ordered(self): + """ Gets the ordered attribute """ + return self._ordered + + ordered = property(fget=_get_ordered, fset=_set_ordered) + def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Sets the categories to the specified new_categories. @@ -486,7 +528,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal ---------- new_categories : Index-like The categories in new order. - ordered : boolean, optional + ordered : boolean, (default: False) Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. 
rename : boolean (default: False) @@ -520,8 +562,9 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal cat._codes = _get_codes_for_values(values, new_categories) cat._categories = new_categories - if not ordered is None: - cat.ordered = ordered + if ordered is None: + ordered = self.ordered + cat.set_ordered(ordered, inplace=True) if not inplace: return cat @@ -765,6 +808,15 @@ def __setstate__(self, state): state['_categories'] = \ self._validate_categories(state.pop('_levels')) + # 0.16.0 ordered change + if '_ordered' not in state: + + # >=15.0 < 0.16.0 + if 'ordered' in state: + state['_ordered'] = state.pop('ordered') + else: + state['_ordered'] = False + for k, v in compat.iteritems(state): setattr(self, k, v) @@ -827,7 +879,8 @@ def searchsorted(self, v, side='left', sorter=None): array([3, 5]) # eggs after donuts, after switching milk and donuts """ if not self.ordered: - raise ValueError("searchsorted requires an ordered Categorical.") + raise ValueError("Categorical not ordered\n" + "you can use .as_ordered() to change the Categorical to an ordered one\n") from pandas.core.series import Series values_as_codes = self.categories.values.searchsorted(Series(v).values, side) @@ -943,6 +996,12 @@ def get_values(self): return np.array(self) + def check_for_ordered(self, op): + """ assert that we are ordered """ + if not self.ordered: + raise TypeError("Categorical is not ordered for operation {op}\n" + "you can use .as_ordered() to change the Categorical to an ordered one\n".format(op=op)) + def argsort(self, ascending=True, **kwargs): """ Implements ndarray.argsort. 
@@ -954,8 +1013,7 @@ def argsort(self, ascending=True, **kwargs): ------- argsorted : numpy array """ - if not self.ordered: - raise TypeError("Categorical not ordered") + self.check_for_ordered('argsort') result = np.argsort(self._codes.copy(), **kwargs) if not ascending: result = result[::-1] @@ -986,8 +1044,7 @@ def order(self, inplace=False, ascending=True, na_position='last'): -------- Category.sort """ - if not self.ordered: - raise TypeError("Categorical not ordered") + self.check_for_ordered('sort') if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) @@ -1367,8 +1424,7 @@ def min(self, numeric_only=None, **kwargs): ------- min : the minimum of this `Categorical` """ - if not self.ordered: - raise TypeError("Categorical not ordered") + self.check_for_ordered('min') if numeric_only: good = self._codes != -1 pointer = self._codes[good].min(**kwargs) @@ -1394,8 +1450,7 @@ def max(self, numeric_only=None, **kwargs): ------- max : the maximum of this `Categorical` """ - if not self.ordered: - raise TypeError("Categorical not ordered") + self.check_for_ordered('max') if numeric_only: good = self._codes != -1 pointer = self._codes[good].max(**kwargs) @@ -1498,6 +1553,8 @@ class CategoricalAccessor(PandasDelegate): >>> s.cat.remove_categories(['d']) >>> s.cat.remove_unused_categories() >>> s.cat.set_categories(list('abcde')) + >>> s.cat.as_ordered() + >>> s.cat.as_unordered() """ @@ -1533,7 +1590,9 @@ def _delegate_method(self, name, *args, **kwargs): "add_categories", "remove_categories", "remove_unused_categories", - "set_categories"], + "set_categories", + "as_ordered", + "as_unordered"], typ='method') ##### utility routines ##### diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3db531c946d01..d76c526237f76 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2209,7 +2209,7 @@ def blocks(self): "Internal property, property synonym for as_blocks()" return self.as_blocks() - def 
astype(self, dtype, copy=True, raise_on_error=True): + def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): """ Cast object to input numpy.dtype Return a copy when copy = True (be really careful with this!) @@ -2218,6 +2218,7 @@ def astype(self, dtype, copy=True, raise_on_error=True): ---------- dtype : numpy.dtype or Python type raise_on_error : raise on invalid input + kwargs : keyword arguments to pass on to the constructor Returns ------- @@ -2225,7 +2226,7 @@ def astype(self, dtype, copy=True, raise_on_error=True): """ mgr = self._data.astype( - dtype=dtype, copy=copy, raise_on_error=raise_on_error) + dtype=dtype, copy=copy, raise_on_error=raise_on_error, **kwargs) return self._constructor(mgr).__finalize__(self) def copy(self, deep=True): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 05947eec8cfaf..73439fb1e535d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1923,8 +1923,18 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif isinstance(self.grouper, Categorical): + + # must have an ordered categorical + if self.sort: + if not self.grouper.ordered: + + # technically we cannot group on an unordered Categorical + # but this a user convenience to do so; the ordering + # is preserved and if its a reduction is doesnt't make any difference + pass + # fix bug #GH8868 sort=False being ignored in categorical groupby - if not self.sort: + else: self.grouper = self.grouper.reorder_categories(self.grouper.unique()) self._labels = self.grouper.codes self._group_index = self.grouper.categories diff --git a/pandas/core/index.py b/pandas/core/index.py index 5ba3026a89f7a..195cb21d6564c 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3628,7 +3628,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): name = None if names is None else names[0] return Index(arrays[0], name=name) - cats = [Categorical.from_array(arr) for arr in arrays] + cats = 
[Categorical.from_array(arr, ordered=True) for arr in arrays] levels = [c.categories for c in cats] labels = [c.codes for c in cats] if names is None: @@ -3721,7 +3721,7 @@ def from_product(cls, iterables, sortorder=None, names=None): from pandas.core.categorical import Categorical from pandas.tools.util import cartesian_product - categoricals = [Categorical.from_array(it) for it in iterables] + categoricals = [Categorical.from_array(it, ordered=True) for it in iterables] labels = cartesian_product([c.codes for c in categoricals]) return MultiIndex(levels=[c.categories for c in categoricals], diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 08618d7794b5d..5cb032521d51a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -367,12 +367,12 @@ def downcast(self, dtypes=None): return blocks - def astype(self, dtype, copy=False, raise_on_error=True, values=None): + def astype(self, dtype, copy=False, raise_on_error=True, values=None, **kwargs): return self._astype(dtype, copy=copy, raise_on_error=raise_on_error, - values=values) + values=values, **kwargs) def _astype(self, dtype, copy=False, raise_on_error=True, values=None, - klass=None): + klass=None, **kwargs): """ Coerce to the new type (if copy=True, return a new copy) raise on an except if raise == True @@ -381,7 +381,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): - return make_block(Categorical(self.values), + return make_block(Categorical(self.values, **kwargs), ndim=self.ndim, placement=self.mgr_locs) @@ -1229,8 +1229,8 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.' 
values = np.array(values, dtype=object) mask = isnull(values) values[mask] = na_rep - - + + if float_format and decimal != '.': formatter = lambda v : (float_format % v).replace('.',decimal,1) elif decimal != '.': @@ -1239,12 +1239,12 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.' formatter = lambda v : float_format % v else: formatter = None - + if formatter: imask = (~mask).ravel() values.flat[imask] = np.array( [formatter(val) for val in values.ravel()[imask]]) - + return values.tolist() def should_store(self, value): diff --git a/pandas/core/panel.py b/pandas/core/panel.py index f77ed59df8ac1..b3fc9aec00271 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -95,8 +95,8 @@ def panel_index(time, panels, names=['time', 'panel']): (1962, 'C')], dtype=object) """ time, panels = _ensure_like_indices(time, panels) - time_factor = Categorical.from_array(time) - panel_factor = Categorical.from_array(panels) + time_factor = Categorical.from_array(time, ordered=True) + panel_factor = Categorical.from_array(panels, ordered=True) labels = [time_factor.codes, panel_factor.codes] levels = [time_factor.categories, panel_factor.categories] diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 1fe23551e886c..291a73778197a 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -157,7 +157,8 @@ def get_result(self): # may need to coerce categoricals here if self.is_categorical is not None: values = [ Categorical.from_array(values[:,i], - categories=self.is_categorical.categories) + categories=self.is_categorical.categories, + ordered=True) for i in range(values.shape[-1]) ] return DataFrame(values, index=index, columns=columns) @@ -1049,7 +1050,7 @@ def check_len(item, name): def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): # Series avoids inconsistent NaN handling - cat = Categorical.from_array(Series(data)) + cat = Categorical.from_array(Series(data), ordered=True) levels = 
cat.categories # if all NaN @@ -1117,7 +1118,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): labels = frame.index.labels[num] if transform is not None: mapped_items = items.map(transform) - cat = Categorical.from_array(mapped_items.take(labels)) + cat = Categorical.from_array(mapped_items.take(labels), ordered=True) labels = cat.codes items = cat.categories diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 05510f655f7be..e784934ea28b2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3637,7 +3637,7 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - factors = [Categorical.from_array(a.values) for a in self.index_axes] + factors = [Categorical.from_array(a.values, ordered=True) for a in self.index_axes] levels = [f.categories for f in factors] N = [len(f.categories) for f in factors] labels = [f.codes for f in factors] diff --git a/pandas/tests/data/categorical_0_15_2.pickle b/pandas/tests/data/categorical_0_15_2.pickle new file mode 100644 index 0000000000000000000000000000000000000000..25cd862976cab2911db7f759d8368f65e134ce84 GIT binary patch literal 392 zcmZvYO-sW-5Qei!Ozl>!pZ_6V0^YoN@T3b73UidOUz$LZ&1^Sl!Ga#5^jGWJR#fO5 zhJp8;d0yrY&#o^jTecY9blm$Hy z@iD cat_rev_base @@ -333,8 +337,7 @@ def f(): self.assertRaises(TypeError, f) # Only categories with same ordering information can be compared - cat_unorderd = cat.copy() - cat_unorderd.ordered = False + cat_unorderd = cat.set_ordered(False) self.assertFalse((cat > cat).any()) def f(): cat > cat_unorderd @@ -373,7 +376,7 @@ def test_na_flags_int_categories(self): def test_categories_none(self): factor = Categorical(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c']) + 'a', 'c', 'c', 'c'], ordered=True) self.assertTrue(factor.equals(self.factor)) def test_describe(self): @@ -454,16 +457,19 @@ def test_big_print(self): def test_empty_print(self): factor = Categorical([], ["a","b","c"], name="cat") - expected = ("[], 
Name: cat, Categories (3, object): [a < b < c]") + expected = ("[], Name: cat, Categories (3, object): [a, b, c]") # hack because array_repr changed in numpy > 1.6.x actual = repr(factor) - self.assertEqual(actual, expected) factor = Categorical([], ["a","b","c"]) - expected = ("[], Categories (3, object): [a < b < c]") + expected = ("[], Categories (3, object): [a, b, c]") actual = repr(factor) + self.assertEqual(expected, actual) + factor = Categorical([], ["a","b","c"], ordered=True) + expected = ("[], Categories (3, object): [a < b < c]") + actual = repr(factor) self.assertEqual(expected, actual) factor = Categorical([], []) @@ -483,7 +489,7 @@ def test_periodindex(self): idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') - cat2 = Categorical.from_array(idx2) + cat2 = Categorical.from_array(idx2, ordered=True) str(cat2) exp_arr = np.array([2, 2, 1, 0, 2, 0],dtype='int64') exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') @@ -492,7 +498,7 @@ def test_periodindex(self): idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') - cat3 = Categorical.from_array(idx3) + cat3 = Categorical.from_array(idx3, ordered=True) exp_arr = np.array([6, 5, 4, 3, 2, 1, 0],dtype='int64') exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') @@ -514,6 +520,60 @@ def f(): s.categories = [1,2] self.assertRaises(ValueError, f) + def test_construction_with_ordered(self): + # GH 9347, 9190 + cat = Categorical([0,1,2]) + self.assertFalse(cat.ordered) + cat = Categorical([0,1,2],ordered=False) + self.assertFalse(cat.ordered) + cat = Categorical([0,1,2],ordered=True) + self.assertTrue(cat.ordered) + + def test_ordered_api(self): + # GH 9347 + cat1 = pd.Categorical(["a","c","b"], ordered=False) + self.assertTrue(cat1.categories.equals(Index(['a','b','c']))) + self.assertFalse(cat1.ordered) + + cat2 = 
pd.Categorical(["a","c","b"], categories=['b','c','a'], ordered=False) + self.assertTrue(cat2.categories.equals(Index(['b','c','a']))) + self.assertFalse(cat2.ordered) + + cat3 = pd.Categorical(["a","c","b"], ordered=True) + self.assertTrue(cat3.categories.equals(Index(['a','b','c']))) + self.assertTrue(cat3.ordered) + + cat4 = pd.Categorical(["a","c","b"], categories=['b','c','a'], ordered=True) + self.assertTrue(cat4.categories.equals(Index(['b','c','a']))) + self.assertTrue(cat4.ordered) + + def test_set_ordered(self): + + cat = Categorical(["a","b","c","a"], ordered=True) + cat2 = cat.as_unordered() + self.assertFalse(cat2.ordered) + cat2 = cat.as_ordered() + self.assertTrue(cat2.ordered) + cat2.as_unordered(inplace=True) + self.assertFalse(cat2.ordered) + cat2.as_ordered(inplace=True) + self.assertTrue(cat2.ordered) + + self.assertTrue(cat2.set_ordered(True).ordered) + self.assertFalse(cat2.set_ordered(False).ordered) + cat2.set_ordered(True, inplace=True) + self.assertTrue(cat2.ordered) + cat2.set_ordered(False, inplace=True) + self.assertFalse(cat2.ordered) + + # deperecated in v0.16.0 + with tm.assert_produces_warning(FutureWarning): + cat.ordered = False + self.assertFalse(cat.ordered) + with tm.assert_produces_warning(FutureWarning): + cat.ordered = True + self.assertTrue(cat.ordered) + def test_set_categories(self): cat = Categorical(["a","b","c","a"], ordered=True) exp_categories = np.array(["c","b","a"]) @@ -549,7 +609,7 @@ def test_set_categories(self): self.assert_numpy_array_equal(cat.categories, exp_categories) # internals... 
- c = Categorical([1,2,3,4,1], categories=[1,2,3,4]) + c = Categorical([1,2,3,4,1], categories=[1,2,3,4], ordered=True) self.assert_numpy_array_equal(c._codes, np.array([0,1,2,3,0])) self.assert_numpy_array_equal(c.categories , np.array([1,2,3,4] )) self.assert_numpy_array_equal(c.get_values(), np.array([1,2,3,4,1] )) @@ -560,6 +620,16 @@ def test_set_categories(self): self.assertTrue(c.min(), 4) self.assertTrue(c.max(), 1) + # set_categories should set the ordering if specified + c2 = c.set_categories([4,3,2,1],ordered=False) + self.assertFalse(c2.ordered) + self.assert_numpy_array_equal(c.get_values(), c2.get_values()) + + # set_categories should pass thru the ordering + c2 = c.set_ordered(False).set_categories([4,3,2,1]) + self.assertFalse(c2.ordered) + self.assert_numpy_array_equal(c.get_values(), c2.get_values()) + def test_rename_categories(self): cat = pd.Categorical(["a","b","c","a"]) @@ -928,8 +998,8 @@ def test_searchsorted(self): # https://github.com/pydata/pandas/issues/8420 s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ]) s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) - c1 = pd.Categorical(s1) - c2 = pd.Categorical(s2) + c1 = pd.Categorical(s1, ordered=True) + c2 = pd.Categorical(s2, ordered=True) # Single item array res = c1.searchsorted(['bread']) @@ -990,13 +1060,13 @@ def test_deprecated_levels(self): self.assertFalse(LooseVersion(pd.__version__) >= '0.18') def test_datetime_categorical_comparison(self): - dt_cat = pd.Categorical(pd.date_range('2014-01-01', periods=3)) + dt_cat = pd.Categorical(pd.date_range('2014-01-01', periods=3), ordered=True) self.assert_numpy_array_equal(dt_cat > dt_cat[0], [False, True, True]) self.assert_numpy_array_equal(dt_cat[0] < dt_cat, [False, True, True]) def test_reflected_comparison_with_scalars(self): # GH8658 - cat = pd.Categorical([1, 2, 3]) + cat = pd.Categorical([1, 2, 3], ordered=True) self.assert_numpy_array_equal(cat > cat[0], [False, True, True]) 
self.assert_numpy_array_equal(cat[0] < cat, [False, True, True]) @@ -1153,6 +1223,17 @@ def test_creation_astype(self): df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) + # with keywords + l = ["a","b","c","a"] + s = pd.Series(l) + exp = pd.Series(Categorical(l, ordered=True)) + res = s.astype('category', ordered=True) + tm.assert_series_equal(res, exp) + + exp = pd.Series(Categorical(l, categories=list('abcdef'), ordered=True)) + res = s.astype('category', categories=list('abcdef'), ordered=True) + tm.assert_series_equal(res, exp) + def test_construction_series(self): l = [1,2,3,1] @@ -1318,7 +1399,7 @@ def test_nan_handling(self): def test_cat_accessor(self): s = Series(Categorical(["a","b",np.nan,"a"])) self.assert_numpy_array_equal(s.cat.categories, np.array(["a","b"])) - self.assertEqual(s.cat.ordered, True) + self.assertEqual(s.cat.ordered, False) exp = Categorical(["a","b",np.nan,"a"], categories=["b","a"]) s.cat.set_categories(["b", "a"], inplace=True) self.assertTrue(s.values.equals(exp)) @@ -1375,8 +1456,10 @@ def test_series_delegations(self): tm.assert_series_equal(s.cat.codes, exp_codes) self.assertEqual(s.cat.ordered, True) - s.cat.ordered = False + s = s.cat.as_unordered() self.assertEqual(s.cat.ordered, False) + s.cat.as_ordered(inplace=True) + self.assertEqual(s.cat.ordered, True) # reorder s = Series(Categorical(["a","b","c","a"], ordered=True)) @@ -1466,11 +1549,11 @@ def test_describe(self): def test_repr(self): a = pd.Series(pd.Categorical([1,2,3,4], name="a")) exp = u("0 1\n1 2\n2 3\n3 4\n" + - "Name: a, dtype: category\nCategories (4, int64): [1 < 2 < 3 < 4]") + "Name: a, dtype: category\nCategories (4, int64): [1, 2, 3, 4]") self.assertEqual(exp, a.__unicode__()) - a = pd.Series(pd.Categorical(["a","b"] *25, name="a")) + a = pd.Series(pd.Categorical(["a","b"] *25, name="a", ordered=True)) exp = u("".join(["%s a\n%s b\n"%(i,i+1) for i in range(0,10,2)]) + "...\n" + "".join(["%s a\n%s b\n"%(i,i+1) for i in 
range(40,50,2)]) + "Name: a, Length: 50, dtype: category\n" + @@ -1478,7 +1561,7 @@ def test_repr(self): self.assertEqual(exp,a._tidy_repr()) levs = list("abcdefghijklmnopqrstuvwxyz") - a = pd.Series(pd.Categorical(["a","b"], name="a", categories=levs)) + a = pd.Series(pd.Categorical(["a","b"], name="a", categories=levs, ordered=True)) exp = u("0 a\n1 b\n" + "Name: a, dtype: category\n" "Categories (26, object): [a < b < c < d ... w < x < y < z]") @@ -1604,15 +1687,15 @@ def test_value_counts_with_nan(self): def test_groupby(self): - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a","b","c","d"]) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a","b","c","d"], ordered=True) data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) expected = DataFrame({ 'a' : Series([1,2,4,np.nan],index=Index(['a','b','c','d'],name='b')) }) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) - raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"]) - raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"]) + raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"], ordered=True) + raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"], ordered=True) df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) # single grouper @@ -1666,8 +1749,8 @@ def f(x): def test_pivot_table(self): - raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"]) - raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"]) + raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"], ordered=True) + raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"], ordered=True) df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) result = pd.pivot_table(df, values='values', index=['A', 'B']) @@ -1704,7 +1787,7 @@ def test_sort(self): self.assert_numpy_array_equal(res.__array__(), exp) raw_cat1 = Categorical(["a","b","c","d"], 
categories=["a","b","c","d"], ordered=False) - raw_cat2 = Categorical(["a","b","c","d"], categories=["d","c","b","a"]) + raw_cat2 = Categorical(["a","b","c","d"], categories=["d","c","b","a"], ordered=True) s = ["a","b","c","d"] df = DataFrame({"unsort":raw_cat1,"sort":raw_cat2, "string":s, "values":[1,2,3,4]}) @@ -1727,7 +1810,7 @@ def f(): # multi-columns sort # GH 7848 df = DataFrame({"id":[6,5,4,3,2,1], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) - df["grade"] = pd.Categorical(df["raw_grade"]) + df["grade"] = pd.Categorical(df["raw_grade"], ordered=True) df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a']) # sorts 'grade' according to the order of the categories @@ -2279,10 +2362,10 @@ def test_comparisons(self): tests_data = [(list("abc"), list("cba"), list("bbb")), ([1,2,3], [3,2,1], [2,2,2])] for data , reverse, base in tests_data: - cat_rev = pd.Series(pd.Categorical(data, categories=reverse)) - cat_rev_base = pd.Series(pd.Categorical(base, categories=reverse)) - cat = pd.Series(pd.Categorical(data)) - cat_base = pd.Series(pd.Categorical(base, categories=cat.cat.categories)) + cat_rev = pd.Series(pd.Categorical(data, categories=reverse, ordered=True)) + cat_rev_base = pd.Series(pd.Categorical(base, categories=reverse, ordered=True)) + cat = pd.Series(pd.Categorical(data, ordered=True)) + cat_base = pd.Series(pd.Categorical(base, categories=cat.cat.categories, ordered=True)) s = Series(base) a = np.array(base) @@ -2616,7 +2699,8 @@ def test_cat_tab_completition(self): # test the tab completion display ok_for_cat = ['categories','codes','ordered','set_categories', 'add_categories', 'remove_categories', 'rename_categories', - 'reorder_categories', 'remove_unused_categories'] + 'reorder_categories', 'remove_unused_categories', + 'as_ordered', 'as_unordered'] def get_dir(s): results = [ r for r in s.cat.__dir__() if not r.startswith('_') ] return list(sorted(set(results))) @@ -2651,6 +2735,23 @@ def test_pickle_v0_14_1(self): # 
self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + def test_pickle_v0_15_2(self): + # ordered -> _ordered + # GH 9347 + + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + if __name__ == '__main__': import nose diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 9498c7e1207c3..17ae8c66e26dd 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3274,7 +3274,7 @@ def test_groupby_sort_categorical(self): ['(2.5, 5]', 4, 50], ['(0, 2.5]', 1, 60], ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) - df['range'] = Categorical(df['range']) + df['range'] = Categorical(df['range'],ordered=True) index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], dtype='object') index.name = 'range' result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) @@ -3288,6 +3288,22 @@ def test_groupby_sort_categorical(self): assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + df['range'] = Categorical(df['range'],ordered=False) + index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], dtype='object') + index.name = 'range' + result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) + result_sort.index = index + index = Index(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', '(0, 2.5]'], dtype='object') + index.name = 'range' + result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=['foo', 'bar']) + 
result_nosort.index = index + + col = 'range' + + #### this is an unordered categorical, but we allow this #### + assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + def test_groupby_sort_multiindex_series(self): # series multiindex groupby sort argument was not being passed through _compress_group_index @@ -3310,7 +3326,7 @@ def test_groupby_categorical(self): levels = ['foo', 'bar', 'baz', 'qux'] codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, levels, name='myfactor') + cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) data = DataFrame(np.random.randn(100, 4)) @@ -3338,7 +3354,7 @@ def test_groupby_datetime_categorical(self): levels = pd.date_range('2014-01-01', periods=4) codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, levels, name='myfactor') + cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) data = DataFrame(np.random.randn(100, 4)) @@ -3485,21 +3501,21 @@ def test_groupby_categorical_no_compress(self): data = Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - cats = Categorical.from_codes(codes, [0, 1, 2]) + cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean() assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - cats = Categorical.from_codes(codes, [0, 1, 2, 3]) + cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean().reindex(cats.categories) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a","b","c","d"]) + categories=["a","b","c","d"], ordered=True) data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) result = data.groupby("b").mean() @@ -4991,7 +5007,7 @@ def 
test_transform_doesnt_clobber_ints(self): def test_groupby_categorical_two_columns(self): # https://github.com/pydata/pandas/issues/8138 - d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"]), + d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"], ordered=True), 'ints': [1, 1, 2, 2],'val': [10, 20, 30, 40]} test = pd.DataFrame(d) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 454b0f79310e4..c7c578232cd0f 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1037,7 +1037,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - levels = [Categorical.from_array(zp).categories for zp in zipped] + levels = [Categorical.from_array(zp, ordered=True).categories for zp in zipped] else: levels = [_ensure_index(x) for x in levels] else: @@ -1075,7 +1075,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): levels.extend(concat_index.levels) label_list.extend(concat_index.labels) else: - factor = Categorical.from_array(concat_index) + factor = Categorical.from_array(concat_index, ordered=True) levels.append(factor.categories) label_list.append(factor.codes) From 87fec5b527dcbb2dd99592e2d7e298cda5ac225d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 10 Mar 2015 19:12:03 -0400 Subject: [PATCH 2/2] allow unordered categorical to order/sort (exclude only min/max) --- doc/source/categorical.rst | 8 ++------ doc/source/whatsnew/v0.16.0.txt | 16 ---------------- pandas/core/categorical.py | 2 -- pandas/tests/test_categorical.py | 18 +++++++++++------- 4 files changed, 13 insertions(+), 31 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index bb246c8aff0a4..6ce93326f0e16 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -280,16 +280,12 @@ Sorting and Order The default for construction has change in v0.16.0 to ``ordered=False``, from the prior implicit 
``ordered=True`` If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a -meaning and certain operations are possible. If the categorical is unordered, a `TypeError` is -raised. +meaning and certain operations are possible. If the categorical is unordered, ``.min()/.max()`` will raise a `TypeError`. .. ipython:: python s = Series(Categorical(["a","b","c","a"], ordered=False)) - try: - s.sort() - except TypeError as e: - print("TypeError: " + str(e)) + s.sort() s = Series(["a","b","c","a"]).astype('category', ordered=True) s.sort() s diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 765324b3aceb4..882dcce8c8164 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -428,22 +428,6 @@ For ease of creation of series of categorical data, we have added the ability to s = Series(["a","b","c","a"]).astype('category',categories=list('abcdef'),ordered=False) s -.. warning:: - - This simple API change may have suprising effects if a user is relying on the previous defaulted behavior implicity. In particular, - sorting operations with a ``Categorical`` will now raise an error: - - .. code-block:: python - - In [1]: df = DataFrame({ 'A' : Series(list('aabc')).astype('category'), 'B' : np.arange(4) }) - - In [2]: df['A'].order() - TypeError: Categorical is not ordered for operation argsort - you can use .as_ordered() to change the Categorical to an ordered one - - The solution is to make 'A' orderable, e.g. 
``df['A'] = df['A'].cat.as_ordered()`` - - Indexing Changes ~~~~~~~~~~~~~~~~ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index ee76527b89974..991678a8e7d79 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1013,7 +1013,6 @@ def argsort(self, ascending=True, **kwargs): ------- argsorted : numpy array """ - self.check_for_ordered('argsort') result = np.argsort(self._codes.copy(), **kwargs) if not ascending: result = result[::-1] @@ -1044,7 +1043,6 @@ def order(self, inplace=False, ascending=True, na_position='last'): -------- Category.sort """ - self.check_for_ordered('sort') if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 55589375ffe12..a7f241168ea73 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -918,9 +918,11 @@ def test_mode(self): def test_sort(self): - # unordered cats are not sortable + # unordered cats are sortable cat = Categorical(["a","b","b","a"], ordered=False) - self.assertRaises(TypeError, lambda : cat.sort()) + cat.order() + cat.sort() + cat = Categorical(["a","c","b","d"], ordered=True) # order @@ -1767,9 +1769,12 @@ def test_count(self): def test_sort(self): - # unordered cats are not sortable cat = Series(Categorical(["a","b","b","a"], ordered=False)) - self.assertRaises(TypeError, lambda : cat.sort()) + + # sort in the categories order + expected = Series(Categorical(["a","a","b","b"], ordered=False),index=[0,3,1,2]) + result = cat.order() + tm.assert_series_equal(result, expected) cat = Series(Categorical(["a","c","b","d"], ordered=True)) @@ -1803,9 +1808,8 @@ def test_sort(self): self.assertEqual(res["sort"].dtype, "category") self.assertEqual(res["unsort"].dtype, "category") - def f(): - df.sort(columns=["unsort"], ascending=False) - self.assertRaises(TypeError, f) + # unordered cat, but we allow this + 
df.sort(columns=["unsort"], ascending=False) # multi-columns sort # GH 7848