From 065415dc95f9569cdc216dab8c03ea7d4667e24d Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Wed, 11 Nov 2015 22:57:59 +0100 Subject: [PATCH] Prevent adding new attributes to the accessors .str, .dt and .cat This commit is mostly for the benefit of users who misspell things on the accessors. Assigning a new attribute on `Series.str`, `Series.dt`, or `Series.cat` did not fail, although you couldn't get the value back: ``` In[10]: a = pandas.Series(pandas.Categorical(list("abc"))) In[11]: a.cat.labels = [1,2] In[12]: a.cat.labels AttributeError: 'CategoricalAccessor' object has no attribute 'labels' ``` Now we fail early: ``` In[10]: a = pandas.Series(pandas.Categorical(list("abc"))) In[11]: a.cat.labels = [1,2] AttributeError: You cannot add any new attribute 'labels' ``` Refactor/add a StringAccessorMixin to break an import cycle. --- doc/source/whatsnew/v0.17.1.txt | 2 ++ pandas/core/base.py | 61 +++++++++++++------------------- pandas/core/categorical.py | 5 +-- pandas/core/index.py | 3 +- pandas/core/series.py | 3 +- pandas/core/strings.py | 45 +++++++++++++++++++++-- pandas/tests/test_base.py | 21 ++++++++++- pandas/tests/test_categorical.py | 6 ++++ pandas/tests/test_series.py | 6 ++++ pandas/tests/test_strings.py | 6 ++++ pandas/tseries/common.py | 5 +-- 11 files changed, 118 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 28819c522c696..2fde26451ad5c 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -76,6 +76,8 @@ Bug Fixes - Bug in merging ``datetime64[ns, tz]`` dtypes (:issue:`11405`) - Bug in ``HDFStore.select`` when comparing with a numpy scalar in a where clause (:issue:`11283`) - Bug in using ``DataFrame.ix`` with a multi-index indexer(:issue:`11372`) +- Prevent adding new attributes to the accessors ``.str``, ``.dt`` and ``.cat``. Retrieving such + a value was not possible, so error out on setting it. 
(:issue:`10673`) - Bug in tz-conversions with an ambiguous time and ``.dt`` accessors (:issue:`11295`) diff --git a/pandas/core/base.py b/pandas/core/base.py index d3850be13b6f0..0f5c43b8e1fff 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,7 +7,6 @@ import pandas.core.nanops as nanops import pandas.lib as lib from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg -from pandas.core.strings import StringMethods from pandas.core.common import AbstractMethodError _shared_docs = dict() @@ -111,6 +110,31 @@ def _reset_cache(self, key=None): else: self._cache.pop(key, None) +class NoNewAttributesMixin(object): + """Mixin which prevents adding new attributes. + + Prevents additional attributes via xxx.attribute = "something" after a call to + `self._freeze()`. Mainly used to prevent the user from using wrong attributes + on an accessor (`Series.cat/.str/.dt`). + + If you really want to add a new attribute at a later time, you need to use + `object.__setattr__(self, key, value)`. + """ + + def _freeze(self): + """Prevents setting additional attributes""" + object.__setattr__(self, "__frozen", True) + + + # prevent adding any attribute via s.xxx.new_attribute = ... 
+ def __setattr__(self, key, value): + # _cache is used by a decorator + # dict lookup instead of getattr as getattr is false for getter which error + if getattr(self, "__frozen", False) and not (key in type(self).__dict__ or key == "_cache"): + raise AttributeError( "You cannot add any new attribute '{key}'".format(key=key)) + object.__setattr__(self, key, value) + + class PandasDelegate(PandasObject): """ an abstract base class for delegating methods/properties """ @@ -517,41 +541,6 @@ def searchsorted(self, key, side='left'): #### needs tests/doc-string return self.values.searchsorted(key, side=side) - # string methods - def _make_str_accessor(self): - from pandas.core.series import Series - from pandas.core.index import Index - if isinstance(self, Series) and not com.is_object_dtype(self.dtype): - # this really should exclude all series with any non-string values, - # but that isn't practical for performance reasons until we have a - # str dtype (GH 9343) - raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - elif isinstance(self, Index): - # see scc/inferrence.pyx which can contain string values - allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') - if self.inferred_type not in allowed_types: - message = ("Can only use .str accessor with string values " - "(i.e. 
inferred_type is 'string', 'unicode' or 'mixed')") - raise AttributeError(message) - if self.nlevels > 1: - message = "Can only use .str accessor with Index, not MultiIndex" - raise AttributeError(message) - return StringMethods(self) - - str = AccessorProperty(StringMethods, _make_str_accessor) - - def _dir_additions(self): - return set() - - def _dir_deletions(self): - try: - getattr(self, 'str') - except AttributeError: - return set(['str']) - return set() - _shared_docs['drop_duplicates'] = ( """Return %(klass)s with duplicate values removed diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index e304684036766..1d9d347f5e5a7 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -8,7 +8,7 @@ from pandas.compat import u from pandas.core.algorithms import factorize -from pandas.core.base import PandasObject, PandasDelegate +from pandas.core.base import PandasObject, PandasDelegate, NoNewAttributesMixin import pandas.core.common as com from pandas.core.missing import interpolate_2d from pandas.util.decorators import cache_readonly, deprecate_kwarg @@ -1717,7 +1717,7 @@ def repeat(self, repeats): ##### The Series.cat accessor ##### -class CategoricalAccessor(PandasDelegate): +class CategoricalAccessor(PandasDelegate, NoNewAttributesMixin): """ Accessor object for categorical properties of the Series values. 
@@ -1742,6 +1742,7 @@ class CategoricalAccessor(PandasDelegate): def __init__(self, values, index): self.categorical = values self.index = index + self._freeze() def _delegate_property_get(self, name): return getattr(self.categorical, name) diff --git a/pandas/core/index.py b/pandas/core/index.py index 644b6a411c79a..b55d583ce63cf 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -25,6 +25,7 @@ _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, _ensure_object, _ensure_int64, is_bool_indexer, is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) +from pandas.core.strings import StringAccessorMixin from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -64,7 +65,7 @@ def _new_Index(cls, d): and breaks __new__ """ return cls.__new__(cls, **d) -class Index(IndexOpsMixin, PandasObject): +class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): """ Immutable ndarray implementing an ordered, sliceable set. The basic object diff --git a/pandas/core/series.py b/pandas/core/series.py index b12a31d64eaf7..7eb8859979ad9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -32,6 +32,7 @@ from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical, CategoricalAccessor +import pandas.core.strings as strings from pandas.tseries.common import (maybe_to_datetimelike, CombinedDatetimelikeProperties) from pandas.tseries.index import DatetimeIndex @@ -85,7 +86,7 @@ def wrapper(self): # Series class -class Series(base.IndexOpsMixin, generic.NDFrame): +class Series(base.IndexOpsMixin, strings.StringAccessorMixin, generic.NDFrame,): """ One-dimensional ndarray with axis labels (including time series). 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py index dddc1f4898908..f1ff7e2178a04 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,8 +1,10 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import isnull, _values_from_object, is_bool_dtype, is_list_like +from pandas.core.common import (isnull, _values_from_object, is_bool_dtype, is_list_like, + is_categorical_dtype, is_object_dtype) import pandas.compat as compat +from pandas.core.base import AccessorProperty, NoNewAttributesMixin from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib @@ -1044,7 +1046,7 @@ def do_copy(target): return do_copy -class StringMethods(object): +class StringMethods(NoNewAttributesMixin): """ Vectorized string functions for Series and Index. NAs stay NA unless @@ -1059,6 +1061,7 @@ class StringMethods(object): def __init__(self, series): self.series = series + self._freeze() def __getitem__(self, key): if isinstance(key, slice): @@ -1542,3 +1545,41 @@ def rindex(self, sub, start=0, end=None): isdecimal = _noarg_wrapper(lambda x: compat.u_safe(x).isdecimal(), docstring=_shared_docs['ismethods'] % _shared_docs['isdecimal']) + +class StringAccessorMixin(object): + """ Mixin to add a `.str` accessor to the class.""" + + # string methods + def _make_str_accessor(self): + from pandas.core.series import Series + from pandas.core.index import Index + if isinstance(self, Series) and not is_object_dtype(self.dtype): + # this really should exclude all series with any non-string values, + # but that isn't practical for performance reasons until we have a + # str dtype (GH 9343) + raise AttributeError("Can only use .str accessor with string " + "values, which use np.object_ dtype in " + "pandas") + elif isinstance(self, Index): + # see scc/inferrence.pyx which can contain string values + allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') + if self.inferred_type not in allowed_types: + 
message = ("Can only use .str accessor with string values " + "(i.e. inferred_type is 'string', 'unicode' or 'mixed')") + raise AttributeError(message) + if self.nlevels > 1: + message = "Can only use .str accessor with Index, not MultiIndex" + raise AttributeError(message) + return StringMethods(self) + + str = AccessorProperty(StringMethods, _make_str_accessor) + + def _dir_additions(self): + return set() + + def _dir_deletions(self): + try: + getattr(self, 'str') + except AttributeError: + return set(['str']) + return set() diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index fb255f300ebdd..fa60633a70a53 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -6,7 +6,7 @@ import pandas.compat as compat import pandas as pd from pandas.compat import u, StringIO -from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate +from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin import pandas.core.common as com from pandas.tseries.base import DatetimeIndexOpsMixin from pandas.util.testing import assertRaisesRegexp, assertIsInstance @@ -825,6 +825,25 @@ def test_lookup_nan(self): self.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs))) +class TestNoNewAttributesMixin(tm.TestCase): + + def test_mixin(self): + class T(NoNewAttributesMixin): + pass + + t = T() + self.assertFalse(hasattr(t, "__frozen")) + t.a = "test" + self.assertEqual(t.a, "test") + t._freeze() + #self.assertTrue("__frozen" not in dir(t)) + self.assertIs(getattr(t, "__frozen"), True) + def f(): + t.b = "test" + self.assertRaises(AttributeError, f) + self.assertFalse(hasattr(t, "b")) + + if __name__ == '__main__': import nose diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 1d143236e285b..6f1311b44d6b5 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3625,6 +3625,12 @@ def test_cat_accessor_api(self): invalid.cat 
self.assertFalse(hasattr(invalid, 'cat')) + def test_cat_accessor_no_new_attributes(self): + # https://github.com/pydata/pandas/issues/10673 + c = Series(list('aabbcde')).astype('category') + with tm.assertRaisesRegexp(AttributeError, "You cannot add any new attribute"): + c.cat.xlabel = "a" + def test_pickle_v0_14_1(self): # we have the name warning diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 1c8cbac60e7c7..60fe2bece628d 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -247,6 +247,12 @@ def f(): s.dt.hour[0] = 5 self.assertRaises(com.SettingWithCopyError, f) + def test_dt_accessor_no_new_attributes(self): + # https://github.com/pydata/pandas/issues/10673 + s = Series(date_range('20130101',periods=5,freq='D')) + with tm.assertRaisesRegexp(AttributeError, "You cannot add any new attribute"): + s.dt.xlabel = "a" + def test_strftime(self): # GH 10086 s = Series(date_range('20130101', periods=5)) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 31623d5c277c4..1017704379811 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2034,6 +2034,12 @@ def test_index_str_accessor_visibility(self): with self.assertRaisesRegexp(AttributeError, message): idx.str + def test_str_accessor_no_new_attributes(self): + # https://github.com/pydata/pandas/issues/10673 + s = Series(list('aabbcde')) + with tm.assertRaisesRegexp(AttributeError, "You cannot add any new attribute"): + s.str.xlabel = "a" + def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index dcfe809074a0b..171f72d37cdd8 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -1,7 +1,7 @@ ## datetimelike delegation ## import numpy as np -from pandas.core.base import PandasDelegate +from pandas.core.base import PandasDelegate, 
NoNewAttributesMixin from pandas.core import common as com from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex @@ -59,12 +59,13 @@ def maybe_to_datetimelike(data, copy=False): raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) -class Properties(PandasDelegate): +class Properties(PandasDelegate, NoNewAttributesMixin): def __init__(self, values, index, name): self.values = values self.index = index self.name = name + self._freeze() def _delegate_property_get(self, name): from pandas import Series