diff --git a/doc/source/api.rst b/doc/source/api.rst
index 1c80712e82d49..f6dfd5cfaf0e7 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -348,7 +348,6 @@ Computations / Descriptive Stats
    Series.median
    Series.min
    Series.mode
-   Series.nunique
    Series.pct_change
    Series.prod
    Series.quantile
@@ -356,8 +355,9 @@ Computations / Descriptive Stats
    Series.skew
    Series.std
    Series.sum
-   Series.unique
    Series.var
+   Series.unique
+   Series.nunique
    Series.value_counts
 
 Reindexing / Selection / Label manipulation
@@ -1053,6 +1053,8 @@ Modifying and Computations
    Index.repeat
    Index.set_names
    Index.unique
+   Index.nunique
+   Index.value_counts
 
 Conversion
 ~~~~~~~~~~
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 7188851214f7f..3f3cfe5dd4359 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -159,6 +159,8 @@ API Changes
 - Arithmetic ops are now disallowed when passed two bool dtype Series or
   DataFrames (:issue:`6762`).
 
+- Added ``nunique`` and ``value_counts`` functions to ``Index`` for counting
+  unique elements. (:issue:`6734`)
+
 Deprecations
 ~~~~~~~~~~~~
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index 23ab8f10116c1..58eec9fa0f528 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -199,6 +199,7 @@ API changes
 - ``Series.iteritems()`` is now lazy (returns an iterator rather than a list).
   This was the documented behavior prior to 0.14. (:issue:`6760`)
 - ``Panel.shift`` now uses ``NDFrame.shift``. It no longer drops the ``nan`` data
   and retains its original shape. (:issue:`4867`)
+- Added ``nunique`` and ``value_counts`` functions to ``Index`` for counting unique elements. (:issue:`6734`)
 
 MultiIndexing Using Slicers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/base.py b/pandas/core/base.py
index f9bf4ca4ce91d..ec6a4ffbcefbb 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -269,6 +269,56 @@ def min(self):
         self._is_allowed_index_op('min')
         return self.values.min()
 
+    def value_counts(self, normalize=False, sort=True, ascending=False,
+                     bins=None):
+        """
+        Returns object containing counts of unique values. The resulting object
+        will be in descending order so that the first element is the most
+        frequently-occurring element. Excludes NA values.
+
+        Parameters
+        ----------
+        normalize : boolean, default False
+            If True then the object returned will contain the relative
+            frequencies of the unique values.
+        sort : boolean, default True
+            Sort by values
+        ascending : boolean, default False
+            Sort in ascending order
+        bins : integer, optional
+            Rather than count values, group them into half-open bins,
+            a convenience for pd.cut, only works with numeric data
+
+        Returns
+        -------
+        counts : Series
+        """
+        from pandas.core.algorithms import value_counts
+        return value_counts(self.values, sort=sort, ascending=ascending,
+                            normalize=normalize, bins=bins)
+
+    def unique(self):
+        """
+        Return array of unique values in the object. Significantly faster than
+        numpy.unique. Includes NA values.
+
+        Returns
+        -------
+        uniques : ndarray
+        """
+        from pandas.core.nanops import unique1d
+        return unique1d(self.values)
+
+    def nunique(self):
+        """
+        Return count of unique elements in the object. Excludes NA values.
+
+        Returns
+        -------
+        nunique : int
+        """
+        return len(self.value_counts())
+
     date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps')
     time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps')
     year = _field_accessor('year', "The year of the datetime")
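
For orientation, here is a minimal sketch of the API the shared ``core/base.py``
implementation above provides once this patch is applied. The semantics follow
the docstrings; the sample values are made up::

    import numpy as np
    import pandas as pd

    idx = pd.Index(['a', 'b', 'b', 'c', 'b', np.nan])

    idx.value_counts()   # Series: b -> 3, then a and c -> 1 (NA excluded)
    idx.nunique()        # 3, since nunique() also excludes NA
    idx.unique()         # ndarray(['a', 'b', 'c', nan]) -- unique() keeps NA
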
diff --git a/pandas/core/index.py b/pandas/core/index.py
index bae4a2c455ec6..b2b0764b81d43 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -1102,18 +1102,6 @@ def sym_diff(self, other, result_name=None):
         the_diff = sorted(set((self - other) + (other - self)))
         return Index(the_diff, name=result_name)
 
-    def unique(self):
-        """
-        Return array of unique values in the Index. Significantly faster than
-        numpy.unique
-
-        Returns
-        -------
-        uniques : ndarray
-        """
-        from pandas.core.nanops import unique1d
-        return unique1d(self.values)
-
     def get_loc(self, key):
         """
         Get integer location for requested label
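
One behavioral detail worth noting, and asserted by the tests further down:
a plain ``Index`` now gets ``unique`` from the shared base implementation and
returns an ndarray, while ``DatetimeIndex`` keeps overriding it to return a
``DatetimeIndex``. A hypothetical illustration::

    import pandas as pd

    plain = pd.Index([1, 2, 2, 3])
    dates = pd.DatetimeIndex(['2014-01-01', '2014-01-01', '2014-01-02'])

    type(plain.unique())   # numpy.ndarray, from the base implementation
    type(dates.unique())   # DatetimeIndex, preserved by the datetime subclass
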
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4ab7855ec2f84..544d327c9a13d 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1095,34 +1095,6 @@ def count(self, level=None):
 
         return notnull(_values_from_object(self)).sum()
 
-    def value_counts(self, normalize=False, sort=True, ascending=False,
-                     bins=None):
-        """
-        Returns Series containing counts of unique values. The resulting Series
-        will be in descending order so that the first element is the most
-        frequently-occurring element. Excludes NA values
-
-        Parameters
-        ----------
-        normalize : boolean, default False
-            If True then the Series returned will contain the relative
-            frequencies of the unique values.
-        sort : boolean, default True
-            Sort by values
-        ascending : boolean, default False
-            Sort in ascending order
-        bins : integer, optional
-            Rather than count values, group them into half-open bins,
-            a convenience for pd.cut, only works with numeric data
-
-        Returns
-        -------
-        counts : Series
-        """
-        from pandas.core.algorithms import value_counts
-        return value_counts(self.values, sort=sort, ascending=ascending,
-                            normalize=normalize, bins=bins)
-
     def mode(self):
         """Returns the mode(s) of the dataset.
 
@@ -1143,27 +1115,6 @@ def mode(self):
         from pandas.core.algorithms import mode
         return mode(self)
 
-    def unique(self):
-        """
-        Return array of unique values in the Series. Significantly faster than
-        numpy.unique
-
-        Returns
-        -------
-        uniques : ndarray
-        """
-        return nanops.unique1d(self.values)
-
-    def nunique(self):
-        """
-        Return count of unique elements in the Series
-
-        Returns
-        -------
-        nunique : int
-        """
-        return len(self.value_counts())
-
     def drop_duplicates(self, take_last=False, inplace=False):
         """
         Return Series with duplicate values removed
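
``Series`` now inherits ``value_counts``, ``unique`` and ``nunique`` from
``core/base.py`` instead of defining them locally; behavior is intended to be
unchanged. A quick sketch of the options the tests below exercise (sample
data made up)::

    import pandas as pd

    s = pd.Series([1, 1, 2, 3])

    s.value_counts()                # 1 -> 2, 2 -> 1, 3 -> 1
    s.value_counts(normalize=True)  # relative frequencies: 0.5, 0.25, 0.25
    s.value_counts(bins=1)          # one half-open bin (via pd.cut) holding all 4 values
    s.nunique()                     # 3
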
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 32416dc975e64..6f7d22e6c50fe 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -1,11 +1,12 @@
 import re
+from datetime import timedelta
 import numpy as np
 import pandas.compat as compat
 import pandas as pd
-from pandas.compat import u
+from pandas.compat import u, StringIO
 from pandas.core.base import FrozenList, FrozenNDArray
 from pandas.util.testing import assertRaisesRegexp, assert_isinstance
-from pandas import Series, Index, DatetimeIndex, PeriodIndex
+from pandas import Series, Index, Int64Index, DatetimeIndex, PeriodIndex
 from pandas import _np_version_under1p7
 import nose
 
@@ -130,6 +131,7 @@ def setUp(self):
         self.int_index = tm.makeIntIndex(10)
         self.float_index = tm.makeFloatIndex(10)
         self.dt_index = tm.makeDateIndex(10)
+        self.dt_tz_index = tm.makeDateIndex(10).tz_localize(tz='US/Eastern')
         self.period_index = tm.makePeriodIndex(10)
         self.string_index = tm.makeStringIndex(10)
 
         self.int_series = Series(arr, index=self.int_index)
         self.float_series = Series(arr, index=self.int_index)
         self.dt_series = Series(arr, index=self.dt_index)
+        self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
         self.period_series = Series(arr, index=self.period_index)
         self.string_series = Series(arr, index=self.string_index)
 
-        self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in ['int','float','dt','period','string'] for f in ['index','series'] ]
+        types = ['int','float','dt', 'dt_tz', 'period','string']
+        self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in types for f in ['index','series'] ]
 
     def check_ops_properties(self, props, filter=None, ignore_failures=False):
         for op in props:
@@ -193,7 +197,207 @@ def test_ops(self):
             for o in self.objs:
                 result = getattr(o,op)()
                 expected = getattr(o.values,op)()
-                self.assertEqual(result, expected)
+                try:
+                    self.assertEqual(result, expected)
+                except ValueError:
+                    # comparing a tz-aware series with an np.array raises ValueError
+                    expected = expected.astype('M8[ns]').astype('int64')
+                    self.assertEqual(result.value, expected)
+
+    def test_value_counts_unique_nunique(self):
+        for o in self.objs:
+            klass = type(o)
+            values = o.values
+
+            # create repeated values: the n-th element is repeated n+1 times
+            if isinstance(o, PeriodIndex):
+                # freq must be specified because repeat makes the freq ambiguous
+                o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
+            else:
+                o = klass(np.repeat(values, range(1, len(o) + 1)))
+
+            expected_s = Series(range(10, 0, -1), index=values[::-1])
+            tm.assert_series_equal(o.value_counts(), expected_s)
+
+            if isinstance(o, DatetimeIndex):
+                # DatetimeIndex.unique returns a DatetimeIndex
+                self.assert_(o.unique().equals(klass(values)))
+            else:
+                self.assert_numpy_array_equal(o.unique(), values)
+
+            self.assertEqual(o.nunique(), len(np.unique(o.values)))
+
+        for null_obj in [np.nan, None]:
+            for o in self.objs:
+                klass = type(o)
+                values = o.values
+
+                if o.values.dtype == 'int64':
+                    # skip int64: it cannot hold nan or None
+                    continue
+
+                if o.values.dtype == 'datetime64[ns]' and _np_version_under1p7:
+                    # unable to assign None
+                    continue
+
+                values[0:2] = null_obj
+
+                # create repeated values: the n-th element is repeated n+1 times
+                if isinstance(o, PeriodIndex):
+                    o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
+                else:
+                    o = klass(np.repeat(values, range(1, len(o) + 1)))
+
+                if isinstance(o, DatetimeIndex):
+                    # DatetimeIndex: nan is cast to NaT and included
+                    expected_s = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1])
+                else:
+                    # nan is excluded
+                    expected_s = Series(range(10, 2, -1), index=values[9:1:-1])
+
+                tm.assert_series_equal(o.value_counts(), expected_s)
+
+                # numpy_array_equal cannot compare arrays containing nan
+                result = o.unique()
+                self.assert_numpy_array_equal(result[1:], values[2:])
+
+                if isinstance(o, DatetimeIndex):
+                    self.assert_(result[0] is pd.NaT)
+                else:
+                    self.assert_(pd.isnull(result[0]))
+
+                if isinstance(o, DatetimeIndex):
+                    self.assertEqual(o.nunique(), 9)
+                else:
+                    self.assertEqual(o.nunique(), 8)
+
+    def test_value_counts_inferred(self):
+        klasses = [Index, Series]
+        for klass in klasses:
+            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
+            s = klass(s_values)
+            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
+            tm.assert_series_equal(s.value_counts(), expected)
+
+            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
+            self.assertEquals(s.nunique(), 4)
+
+            # don't sort; have to sort after the fact, as not sorting is platform-dependent
+            hist = s.value_counts(sort=False)
+            hist.sort()
+            expected = Series([3, 1, 4, 2], index=list('acbd'))
+            expected.sort()
+            tm.assert_series_equal(hist, expected)
+
+            # sort ascending
+            hist = s.value_counts(ascending=True)
+            expected = Series([1, 2, 3, 4], index=list('cdab'))
+            tm.assert_series_equal(hist, expected)
+            # relative histogram
+            hist = s.value_counts(normalize=True)
+            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
+            tm.assert_series_equal(hist, expected)
+
+            # bins
+            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)
+
+            s1 = Series([1, 1, 2, 3])
+            res1 = s1.value_counts(bins=1)
+            exp1 = Series({0.998: 4})
+            tm.assert_series_equal(res1, exp1)
+            res1n = s1.value_counts(bins=1, normalize=True)
+            exp1n = Series({0.998: 1.0})
+            tm.assert_series_equal(res1n, exp1n)
+
+            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
+            self.assertEquals(s1.nunique(), 3)
+
+            res4 = s1.value_counts(bins=4)
+            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
+            tm.assert_series_equal(res4, exp4)
+            res4n = s1.value_counts(bins=4, normalize=True)
+            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
+            tm.assert_series_equal(res4n, exp4n)
+
+            # handle NA's properly
+            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
+            s = klass(s_values)
+            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
+            tm.assert_series_equal(s.value_counts(), expected)
+
+            self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
+            self.assertEquals(s.nunique(), 3)
+
+            s = klass({})
+            expected = Series([], dtype=np.int64)
+            tm.assert_series_equal(s.value_counts(), expected)
+            self.assert_numpy_array_equal(s.unique(), np.array([]))
+            self.assertEquals(s.nunique(), 0)
+
+            # GH 3002, datetime64[ns]
+            txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
+                             'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'])
+            f = StringIO(txt)
+            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
+                             parse_dates=["dt"])
+
+            s = klass(df['dt'].copy())
+
+            idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00Z'])
+            expected_s = Series([3, 2, 1], index=idx)
+            tm.assert_series_equal(s.value_counts(), expected_s)
+
+            expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
+                                dtype='datetime64[ns]')
+            if isinstance(s, DatetimeIndex):
+                expected = DatetimeIndex(expected)
+                self.assert_(s.unique().equals(expected))
+            else:
+                self.assert_numpy_array_equal(s.unique(), expected)
+
+            self.assertEquals(s.nunique(), 3)
+
+            # with NaT
+            s = df['dt'].copy()
+            s = klass([v for v in s.values] + [pd.NaT])
+
+            result = s.value_counts()
+            self.assertEqual(result.index.dtype, 'datetime64[ns]')
+            expected_s[pd.NaT] = 1
+            tm.assert_series_equal(result, expected_s)
+
+            unique = s.unique()
+            self.assertEqual(unique.dtype, 'datetime64[ns]')
+            # numpy_array_equal cannot compare pd.NaT
+            self.assert_numpy_array_equal(unique[:3], expected)
+            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)
+
+            self.assertEquals(s.nunique(), 4)
+
+            # timedelta64[ns]
+            td = df.dt - df.dt + timedelta(1)
+            td = klass(td)
+
+            result = td.value_counts()
+            expected_s = Series([6], index=[86400000000000])
+            self.assertEqual(result.index.dtype, 'int64')
+            tm.assert_series_equal(result, expected_s)
+
+            # get nanoseconds to compare
+            expected = np.array([86400000000000])
+            self.assert_numpy_array_equal(td.unique(), expected)
+            self.assertEquals(td.nunique(), 1)
+
+            td2 = timedelta(1) + (df.dt - df.dt)
+            td2 = klass(td2)
+            result2 = td2.value_counts()
+
+            self.assertEqual(result2.index.dtype, 'int64')
+            tm.assert_series_equal(result2, expected_s)
+
+            self.assert_numpy_array_equal(td2.unique(), expected)
+            self.assertEquals(td2.nunique(), 1)
+
 
 class TestDatetimeIndexOps(Ops):
     _allowed = '_allow_datetime_index_ops'
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 143e47baab465..8680446241659 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -3805,84 +3805,7 @@ def test_dot(self):
         self.assertRaises(ValueError, a.dot, b.T)
 
     def test_value_counts_nunique(self):
-        s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])
-        hist = s.value_counts()
-        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
-        assert_series_equal(hist, expected)
-
-        # don't sort, have to sort after the fact as not sorting is platform-dep
-        hist = s.value_counts(sort=False)
-        hist.sort()
-        expected = Series([3, 1, 4, 2], index=list('acbd'))
-        expected.sort()
-        assert_series_equal(hist, expected)
-
-        # sort ascending
-        hist = s.value_counts(ascending=True)
-        expected = Series([1, 2, 3, 4], index=list('cdab'))
-        assert_series_equal(hist, expected)
-
-        # relative histogram.
-        hist = s.value_counts(normalize=True)
-        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
-        assert_series_equal(hist, expected)
-
-        self.assertEquals(s.nunique(), 4)
-
-        # bins
-        self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)
-
-        s1 = Series([1, 1, 2, 3])
-        res1 = s1.value_counts(bins=1)
-        exp1 = Series({0.998: 4})
-        assert_series_equal(res1, exp1)
-        res1n = s1.value_counts(bins=1, normalize=True)
-        exp1n = Series({0.998: 1.0})
-        assert_series_equal(res1n, exp1n)
-
-        res4 = s1.value_counts(bins=4)
-        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
-        assert_series_equal(res4, exp4)
-        res4n = s1.value_counts(bins=4, normalize=True)
-        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
-        assert_series_equal(res4n, exp4n)
-
-        # handle NA's properly
-        s[5:7] = np.nan
-        hist = s.value_counts()
-        expected = s.dropna().value_counts()
-        assert_series_equal(hist, expected)
-
-        s = Series({})
-        hist = s.value_counts()
-        expected = Series([], dtype=np.int64)
-        assert_series_equal(hist, expected)
-
-        # GH 3002, datetime64[ns]
-        import pandas as pd
-        f = StringIO(
-            "xxyyzz20100101PIE\nxxyyzz20100101GUM\nxxyyww20090101EGG\nfoofoo20080909PIE")
-        df = pd.read_fwf(f, widths=[6, 8, 3], names=[
-            "person_id", "dt", "food"], parse_dates=["dt"])
-        s = df.dt.copy()
-        result = s.value_counts()
-        self.assertEqual(result.index.dtype, 'datetime64[ns]')
-
-        # with NaT
-        s = s.append(Series({4: pd.NaT}))
-        result = s.value_counts()
-        self.assertEqual(result.index.dtype, 'datetime64[ns]')
-
-        # timedelta64[ns]
-        from datetime import timedelta
-        td = df.dt - df.dt + timedelta(1)
-        td2 = timedelta(1) + (df.dt - df.dt)
-        result = td.value_counts()
-        result2 = td2.value_counts()
-        #self.assertEqual(result.index.dtype, 'timedelta64[ns]')
-        self.assertEqual(result.index.dtype, 'int64')
-        self.assertEqual(result2.index.dtype, 'int64')
-
+        # basics.rst doc example
         series = Series(np.random.randn(500))
         series[20:500] = np.nan
@@ -3909,25 +3832,7 @@ def test_unique(self):
         result = s.unique()
         self.assertEqual(len(result), 2)
 
-        # integers
-        s = Series(np.random.randint(0, 100, size=100))
-        result = np.sort(s.unique())
-        expected = np.unique(s.values)
-        self.assert_numpy_array_equal(result, expected)
-
-        s = Series(np.random.randint(0, 100, size=100).astype(np.int32))
-        result = np.sort(s.unique())
-        expected = np.unique(s.values)
-        self.assert_numpy_array_equal(result, expected)
-
-        # test string arrays for coverage
-        strings = np.tile(np.array([tm.rands(10) for _ in range(10)]), 10)
-        result = np.sort(nanops.unique1d(strings))
-        expected = np.unique(strings)
-        self.assert_numpy_array_equal(result, expected)
-
         # decision about None
         s = Series([1, 2, 3, None, None, None], dtype=object)
         result = s.unique()
         expected = np.array([1, 2, 3, None], dtype=object)
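
A short sketch of the new ``PeriodIndex`` behavior the next hunk asserts,
mirroring the test data::

    import pandas as pd

    idx = pd.PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN')

    idx.unique()    # underlying values for the periods 2000, 2007, 2009
    idx.nunique()   # 3
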
diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py
index 4a4fbb146861d..9dc26f2b01ccc 100644
--- a/pandas/tseries/tests/test_period.py
+++ b/pandas/tseries/tests/test_period.py
@@ -1487,6 +1487,17 @@ def test_index_duplicate_periods(self):
         expected = ts[idx == 2007]
         assert_series_equal(result, expected)
 
+    def test_index_unique(self):
+        idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN')
+        expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN')
+        self.assert_numpy_array_equal(idx.unique(), expected.values)
+        self.assertEqual(idx.nunique(), 3)
+
+        idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', tz='US/Eastern')
+        expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', tz='US/Eastern')
+        self.assert_numpy_array_equal(idx.unique(), expected.values)
+        self.assertEqual(idx.nunique(), 3)
+
     def test_constructor(self):
         pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009')
         assert_equal(len(pi), 9)
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index 64da6f76f3697..f7edd92fce122 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -77,7 +77,11 @@ def test_is_unique_monotonic(self):
 
     def test_index_unique(self):
         uniques = self.dups.index.unique()
+        expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3),
+                                  datetime(2000, 1, 4), datetime(2000, 1, 5)])
         self.assertEqual(uniques.dtype, 'M8[ns]')  # sanity
+        self.assert_(uniques.equals(expected))
+        self.assertEqual(self.dups.index.nunique(), 4)
 
         # #2563
         self.assertTrue(isinstance(uniques, DatetimeIndex))
@@ -85,8 +89,21 @@ def test_index_unique(self):
         dups_local = self.dups.index.tz_localize('US/Eastern')
         dups_local.name = 'foo'
         result = dups_local.unique()
+        expected = DatetimeIndex(expected, tz='US/Eastern')
         self.assertTrue(result.tz is not None)
         self.assertEquals(result.name, 'foo')
+        self.assert_(result.equals(expected))
+
+        # NaT
+        arr = [ 1370745748 + t for t in range(20) ] + [iNaT]
+        idx = DatetimeIndex(arr * 3)
+        self.assert_(idx.unique().equals(DatetimeIndex(arr)))
+        self.assertEqual(idx.nunique(), 21)
+
+        arr = [ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]
+        idx = DatetimeIndex(arr * 3)
+        self.assert_(idx.unique().equals(DatetimeIndex(arr)))
+        self.assertEqual(idx.nunique(), 21)
 
     def test_index_dupes_contains(self):
         d = datetime(2011, 12, 5, 20, 30)
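
As the timeseries tests above assert, ``NaT`` is retained by
``DatetimeIndex.unique`` and counted by ``nunique`` at this stage of the API;
NA is only excluded for non-datetime containers. A minimal sketch::

    import pandas as pd

    idx = pd.DatetimeIndex(['2013-06-09', '2013-06-10', pd.NaT] * 3)

    idx.unique()    # 2013-06-09, 2013-06-10 and NaT, in order of appearance
    idx.nunique()   # 3 -- NaT is included in the count for DatetimeIndex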