From 1d9bc57d8b90cb0e0a026d60cb1c3a0a84e70a7d Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Sun, 17 Nov 2013 02:08:44 -0800 Subject: [PATCH 1/3] ENH nlargest and nsmallest Series methods --- doc/source/v0.13.1.txt | 1 + pandas/algos.pyx | 60 +++++++++++++++++++---------- pandas/core/series.py | 77 ++++++++++++++++++++++++++++++------- pandas/tests/test_series.py | 33 ++++++++++++++++ pandas/tools/util.py | 70 ++++++++++++++++++++++++++++++++- vb_suite/series_methods.py | 29 ++++++++++++++ 6 files changed, 234 insertions(+), 36 deletions(-) create mode 100644 vb_suite/series_methods.py diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index b48f555f9691a..557abfc48a023 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -128,6 +128,7 @@ API changes import pandas.core.common as com com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) np.array_equal(np.array([0, np.nan]), np.array([0, np.nan])) +- Add nsmallest and nlargest Series methods (:issue:`3960`) - ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 3b527740505e4..19a80c986af8e 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -21,6 +21,9 @@ from numpy cimport NPY_FLOAT16 as NPY_float16 from numpy cimport NPY_FLOAT32 as NPY_float32 from numpy cimport NPY_FLOAT64 as NPY_float64 +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + int8 = np.dtype(np.int8) int16 = np.dtype(np.int16) int32 = np.dtype(np.int32) @@ -736,16 +739,34 @@ def _check_minp(win, minp, N): # Physical description: 366 p. # Series: Prentice-Hall Series in Automatic Computation -def kth_smallest(ndarray[double_t] a, Py_ssize_t k): - cdef: - Py_ssize_t i,j,l,m,n - double_t x, t +ctypedef fused kth_type: + int8_t + int16_t + int32_t + int64_t - n = len(a) + uint8_t + uint16_t + uint32_t + uint64_t - l = 0 - m = n-1 - while (l j: break @@ -765,6 +784,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k): if k < i: m = j return a[k] + cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): cdef: Py_ssize_t i,j,l,m @@ -781,9 +801,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): while a[i] < x: i += 1 while x < a[j]: j -= 1 if i <= j: - t = a[i] - a[i] = a[j] - a[j] = t + swap_kth(&a[i], &a[j]) i += 1; j -= 1 if i > j: break @@ -793,22 +811,22 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): return a[k] -def median(ndarray arr): +cpdef kth_type median(kth_type[:] arr): ''' A faster median ''' - cdef int n = len(arr) + cdef Py_ssize_t n = arr.size - if len(arr) == 0: + if n == 0: return np.NaN arr = arr.copy() if n % 2: - return kth_smallest(arr, n / 2) + return kth_smallest(arr, n // 2) else: - return (kth_smallest(arr, n / 2) + - kth_smallest(arr, n / 2 - 1)) / 2 + return (kth_smallest(arr, n // 2) + + kth_smallest(arr, n // 2 - 1)) / 2 # -------------- Min, Max subsequence @@ -2226,7 +2244,7 @@ cdef inline float64_t _median_linear(float64_t* a, int n): if n % 2: - result = kth_smallest_c(a, n / 2, n) + result = kth_smallest_c( a, n / 2, n) else: result = (kth_smallest_c(a, n / 2, n) + kth_smallest_c(a, n / 2 - 1, n)) / 2 diff --git a/pandas/core/series.py b/pandas/core/series.py index d95f8da8097e9..0151c55f91430 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -6,7 +6,6 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 -import operator import types import warnings @@ -15,21 +14,16 @@ import numpy.ma as ma from pandas.core.common import (isnull, notnull, _is_bool_indexer, - _default_index, _maybe_promote, _maybe_upcast, - _asarray_tuplesafe, is_integer_dtype, - _NS_DTYPE, _TD_DTYPE, - _infer_dtype_from_scalar, is_list_like, - _values_from_object, + _default_index, _maybe_upcast, + _asarray_tuplesafe, _infer_dtype_from_scalar, + is_list_like, _values_from_object, _possibly_cast_to_datetime, _possibly_castable, - _possibly_convert_platform, - _try_sort, + _possibly_convert_platform, _try_sort, ABCSparseArray, _maybe_match_name, _ensure_object, SettingWithCopyError) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index) -from pandas.core.indexing import ( - _check_bool_indexer, - _is_index_slice, _maybe_convert_indices) +from pandas.core.indexing import _check_bool_indexer, _maybe_convert_indices from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical @@ -37,7 +31,7 @@ from pandas.tseries.period import PeriodIndex, Period from pandas import compat from pandas.util.terminal import get_terminal_size -from pandas.compat import zip, lzip, u, OrderedDict +from pandas.compat import zip, u, OrderedDict import pandas.core.array as pa import pandas.core.ops as ops @@ -46,7 +40,7 @@ import pandas.core.datetools as datetools import pandas.core.format as fmt import pandas.core.nanops as nanops -from pandas.util.decorators import Appender, Substitution, cache_readonly +from pandas.util.decorators import Appender, cache_readonly import pandas.lib as lib import pandas.tslib as tslib @@ -1705,7 +1699,17 @@ def _try_kind_sort(arr): good = ~bad idx = pa.arange(len(self)) - argsorted = _try_kind_sort(arr[good]) + def _try_kind_sort(arr, kind='mergesort'): + # easier to ask forgiveness than permission + try: + # if kind==mergesort, it can fail for object dtype + return arr.argsort(kind=kind) + except TypeError: + # stable sort not available for object dtype + # uses the argsort default quicksort + return arr.argsort(kind='quicksort') + + argsorted = _try_kind_sort(arr[good], kind=kind) if not ascending: argsorted = argsorted[::-1] @@ -1728,6 +1732,51 @@ def _try_kind_sort(arr): else: return result.__finalize__(self) + def nlargest(self, n=5, take_last=False): + ''' + Returns the largest n rows: + + May be faster than .order(ascending=False).head(n). + + ''' + # TODO remove need for dropna ? + dropped = self.dropna() + + from pandas.tools.util import nlargest + + if dropped.dtype == object: + try: + dropped = dropped.astype(float) + except (NotImplementedError, TypeError): + return dropped.order(ascending=False).head(n) + + inds = nlargest(dropped.values, n, take_last) + if len(inds) == 0: + # TODO remove this special case + return dropped[[]] + return dropped.iloc[inds] + + def nsmallest(self, n=5, take_last=False): + ''' + Returns the smallest n rows. + + May be faster than .order().head(n). + + ''' + # TODO remove need for dropna ? + dropped = self.dropna() + + from pandas.tools.util import nsmallest + try: + inds = nsmallest(dropped.values, n, take_last) + except NotImplementedError: + return dropped.order().head(n) + + if len(inds) == 0: + # TODO remove this special case + return dropped[[]] + return dropped.iloc[inds] + def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ Sort Series with MultiIndex by chosen level. Data will be diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 6e7c9edfc4025..e13a827cc8d00 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -3998,6 +3998,39 @@ def test_order(self): ordered = ts.order(ascending=False, na_position='first') assert_almost_equal(expected, ordered.valid().values) + def test_nsmallest_nlargest(self): + # float, int, datetime64 (use i8), timedelts64 (same), + # object that are numbers, object that are strings + + s_list = [Series([3, 2, 1, 2, 5]), + Series([3., 2., 1., 2., 5.]), + Series([3., 2, 1, 2, 5], dtype='object'), + Series([3., 2, 1, 2, '5'], dtype='object'), + Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))] + + for s in s_list: + + assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) + assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]]) + + assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]]) + assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]]) + + empty = s.iloc[0:0] + assert_series_equal(s.nsmallest(0), empty) + assert_series_equal(s.nsmallest(-1), empty) + assert_series_equal(s.nlargest(0), empty) + assert_series_equal(s.nlargest(-1), empty) + + assert_series_equal(s.nsmallest(len(s)), s.order()) + assert_series_equal(s.nsmallest(len(s) + 1), s.order()) + assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]]) + + s = Series([3., np.nan, 1, 2, 5]) + assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) + assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) + def test_rank(self): from pandas.compat.scipy import rankdata diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 6dbefc4b70930..6a8cd0e358f87 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,6 +1,9 @@ from pandas.compat import reduce from pandas.core.index import Index import numpy as np +from pandas import algos +import pandas.core.common as com + def match(needles, haystack): haystack = Index(haystack) @@ -17,7 +20,7 @@ def cartesian_product(X): -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), - array([1, 2, 1, 2, 1, 2])] + array([1, 2, 1, 2, 1, 2])] ''' @@ -43,3 +46,68 @@ def compose(*funcs): """Compose 2 or more callables""" assert len(funcs) > 1, 'At least 2 callables must be passed to compose' return reduce(_compose2, funcs) + + +_dtype_map = {'datetime64[ns]': 'int64', 'int64': 'int64', + 'float64': 'float64'} + + +def nsmallest(arr, n=5, take_last=False): + ''' + Find the indices of the n smallest values of a numpy array. + + Note: Fails silently with NaN. + + ''' + if n <= 0: + return np.array([]) # empty + elif n >= len(arr): + n = len(arr) + + if arr.dtype == object: + # just sort and take n + return arr.argsort(kind='mergesort')[:n] + + try: + dtype = _dtype_map[str(arr.dtype)] + except KeyError: + raise NotImplementedError("Not implemented for %s dtype, " + "perhaps convert to int64 or float64, " + "or use .order().head(n)") % arr.dtype + + arr = arr.view(dtype) + + if take_last: + arr = arr[::-1] + + kth_val = algos.kth_smallest(arr.copy(), n - 1) + + ns, = np.nonzero(arr <= kth_val) + inds = ns[arr[ns].argsort(kind='mergesort')][:n] + + if take_last: + # reverse indices + return len(arr) - 1 - inds + return inds + + +def nlargest(arr, n=5, take_last=False): + ''' + Find the indices of the n largest values of a numpy array. + + Note: Fails silently with NaN. + + ''' + if n <= 0: + return np.array([]) # empty + + n = min(n, len(arr)) + + if arr.dtype == object: + try: + arr = arr.astype(float) + except: + raise TypeError("An object array must convert to float.") + + arr = -arr.view(_dtype_map[str(arr.dtype)]) + return nsmallest(arr, n, take_last=take_last) diff --git a/vb_suite/series_methods.py b/vb_suite/series_methods.py new file mode 100644 index 0000000000000..1659340cfe050 --- /dev/null +++ b/vb_suite/series_methods.py @@ -0,0 +1,29 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +setup = common_setup + """ +s1 = Series(np.random.randn(10000)) +s2 = Series(np.random.randint(1, 10, 10000)) +""" + +series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);' + 's1.nlargest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) +series_nlargest2 = Benchmark('s2.nlargest(3, take_last=True);' + 's2.nlargest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) + +series_nsmallest2 = Benchmark('s1.nsmallest(3, take_last=True);' + 's1.nsmallest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) + +series_nsmallest2 = Benchmark('s2.nsmallest(3, take_last=True);' + 's2.nsmallest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) From a909e1649bb947736c3bcac8829af3ba3532a4a3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 12 May 2014 09:21:13 -0400 Subject: [PATCH 2/3] DOC/REF: add docstrings and DRY it up --- doc/source/v0.13.1.txt | 1 - doc/source/v0.14.0.txt | 1 + pandas/algos.pyx | 27 ++++++---- pandas/core/algorithms.py | 88 +++++++++++++++++++++++++++++-- pandas/core/series.py | 100 ++++++++++++++++++++---------------- pandas/tests/test_series.py | 43 +++++++++++++--- pandas/tools/util.py | 67 +----------------------- 7 files changed, 197 insertions(+), 130 deletions(-) diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index 557abfc48a023..b48f555f9691a 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -128,7 +128,6 @@ API changes import pandas.core.common as com com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) np.array_equal(np.array([0, np.nan]), np.array([0, np.nan])) -- Add nsmallest and nlargest Series methods (:issue:`3960`) - ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 8182bff92fb63..c033debbb6808 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -643,6 +643,7 @@ Enhancements values='Quantity', aggfunc=np.sum) - str.wrap implemented (:issue:`6999`) +- Add nsmallest and nlargest Series methods (:issue:`3960`) - `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`) diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 19a80c986af8e..431ef97debae6 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -739,7 +739,8 @@ def _check_minp(win, minp, N): # Physical description: 366 p. # Series: Prentice-Hall Series in Automatic Computation -ctypedef fused kth_type: + +ctypedef fused numeric: int8_t int16_t int32_t @@ -754,17 +755,25 @@ ctypedef fused kth_type: float64_t -cdef void swap_kth(kth_type *a, kth_type *b): - cdef kth_type t +cdef inline Py_ssize_t swap(numeric *a, numeric *b) except -1: + cdef numeric t + + # cython doesn't allow pointer dereference so use array syntax t = a[0] a[0] = b[0] b[0] = t + return 0 -cpdef kth_type kth_smallest(kth_type[:] a, Py_ssize_t k): +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k): cdef: - Py_ssize_t i, j, l = 0, n = a.size, m = n - 1 - kth_type x + Py_ssize_t i, j, l, m, n = a.size + numeric x + + l = 0 + m = n - 1 while l < m: x = a[k] @@ -775,7 +784,7 @@ cpdef kth_type kth_smallest(kth_type[:] a, Py_ssize_t k): while a[i] < x: i += 1 while x < a[j]: j -= 1 if i <= j: - swap_kth(&a[i], &a[j]) + swap(&a[i], &a[j]) i += 1; j -= 1 if i > j: break @@ -801,7 +810,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): while a[i] < x: i += 1 while x < a[j]: j -= 1 if i <= j: - swap_kth(&a[i], &a[j]) + swap(&a[i], &a[j]) i += 1; j -= 1 if i > j: break @@ -811,7 +820,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): return a[k] -cpdef kth_type median(kth_type[:] arr): +cpdef numeric median(numeric[:] arr): ''' A faster median ''' diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 002d5480b9b7b..954f18ccb69b8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -9,9 +9,7 @@ import pandas.core.common as com import pandas.algos as algos import pandas.hashtable as htable -import pandas.compat as compat -from pandas.compat import filter, string_types -from pandas.util.decorators import deprecate_kwarg +from pandas.compat import string_types def match(to_match, values, na_sentinel=-1): """ @@ -413,6 +411,90 @@ def group_position(*args): return result +_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'} + + +def _finalize_nsmallest(arr, kth_val, n, take_last, narr): + ns, = np.nonzero(arr <= kth_val) + inds = ns[arr[ns].argsort(kind='mergesort')][:n] + + if take_last: + # reverse indices + return narr - 1 - inds + return inds + + +def nsmallest(arr, n, take_last=False): + ''' + Find the indices of the n smallest values of a numpy array. + + Note: Fails silently with NaN. + + ''' + if take_last: + arr = arr[::-1] + + narr = len(arr) + n = min(n, narr) + + sdtype = str(arr.dtype) + arr = arr.view(_dtype_map.get(sdtype, sdtype)) + + kth_val = algos.kth_smallest(arr.copy(), n - 1) + return _finalize_nsmallest(arr, kth_val, n, take_last, narr) + + +def nlargest(arr, n, take_last=False): + """ + Find the indices of the n largest values of a numpy array. + + Note: Fails silently with NaN. + """ + sdtype = str(arr.dtype) + arr = arr.view(_dtype_map.get(sdtype, sdtype)) + return nsmallest(-arr, n, take_last=take_last) + + +def select_n_slow(dropped, n, take_last, method): + reverse_it = take_last or method == 'nlargest' + ascending = method == 'nsmallest' + slc = np.s_[::-1] if reverse_it else np.s_[:] + return dropped[slc].order(ascending=ascending).head(n) + + +_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} + + +def select_n(series, n, take_last, method): + """Implement n largest/smallest. + + Parameters + ---------- + n : int + take_last : bool + method : str, {'nlargest', 'nsmallest'} + + Returns + ------- + nordered : Series + """ + dtype = series.dtype + if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64, + np.timedelta64)): + raise TypeError("Cannot use method %r with dtype %s" % (method, dtype)) + + if n <= 0: + return series[[]] + + dropped = series.dropna() + + if n >= len(series): + return select_n_slow(dropped, n, take_last, method) + + inds = _select_methods[method](dropped.values, n, take_last) + return dropped.iloc[inds] + + _rank1d_functions = { 'float64': algos.rank_1d_float64, 'int64': algos.rank_1d_int64, diff --git a/pandas/core/series.py b/pandas/core/series.py index 0151c55f91430..1637ba49d86a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -35,6 +35,7 @@ import pandas.core.array as pa import pandas.core.ops as ops +from pandas.core.algorithms import select_n import pandas.core.common as com import pandas.core.datetools as datetools @@ -1699,17 +1700,7 @@ def _try_kind_sort(arr): good = ~bad idx = pa.arange(len(self)) - def _try_kind_sort(arr, kind='mergesort'): - # easier to ask forgiveness than permission - try: - # if kind==mergesort, it can fail for object dtype - return arr.argsort(kind=kind) - except TypeError: - # stable sort not available for object dtype - # uses the argsort default quicksort - return arr.argsort(kind='quicksort') - - argsorted = _try_kind_sort(arr[good], kind=kind) + argsorted = _try_kind_sort(arr[good]) if not ascending: argsorted = argsorted[::-1] @@ -1733,49 +1724,70 @@ def _try_kind_sort(arr, kind='mergesort'): return result.__finalize__(self) def nlargest(self, n=5, take_last=False): - ''' - Returns the largest n rows: + """Return the largest `n` elements. - May be faster than .order(ascending=False).head(n). + Parameters + ---------- + n : int + Return this many descending sorted values + take_last : bool + Where there are duplicate values, take the last duplicate - ''' - # TODO remove need for dropna ? - dropped = self.dropna() + Returns + ------- + top_n : Series + The n largest values in the Series, in sorted order - from pandas.tools.util import nlargest + Notes + ----- + Faster than ``.order(ascending=False).head(n)`` for small `n` relative + to the size of the ``Series`` object. - if dropped.dtype == object: - try: - dropped = dropped.astype(float) - except (NotImplementedError, TypeError): - return dropped.order(ascending=False).head(n) + See Also + -------- + Series.nsmallest - inds = nlargest(dropped.values, n, take_last) - if len(inds) == 0: - # TODO remove this special case - return dropped[[]] - return dropped.iloc[inds] + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> s = pd.Series(np.random.randn(1e6)) + >>> s.nlargest(10) # only sorts up to the N requested + """ + return select_n(self, n=n, take_last=take_last, method='nlargest') def nsmallest(self, n=5, take_last=False): - ''' - Returns the smallest n rows. + """Return the smallest `n` elements. - May be faster than .order().head(n). + Parameters + ---------- + n : int + Return this many ascending sorted values + take_last : bool + Where there are duplicate values, take the last duplicate - ''' - # TODO remove need for dropna ? - dropped = self.dropna() + Returns + ------- + bottom_n : Series + The n smallest values in the Series, in sorted order - from pandas.tools.util import nsmallest - try: - inds = nsmallest(dropped.values, n, take_last) - except NotImplementedError: - return dropped.order().head(n) - - if len(inds) == 0: - # TODO remove this special case - return dropped[[]] - return dropped.iloc[inds] + Notes + ----- + Faster than ``.order().head(n)`` for small `n` relative to + the size of the ``Series`` object. + + See Also + -------- + Series.nlargest + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> s = pd.Series(np.random.randn(1e6)) + >>> s.nsmallest(10) # only sorts up to the N requested + """ + return select_n(self, n=n, take_last=take_last, method='nsmallest') def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index e13a827cc8d00..434c21bfa76de 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4002,11 +4002,38 @@ def test_nsmallest_nlargest(self): # float, int, datetime64 (use i8), timedelts64 (same), # object that are numbers, object that are strings - s_list = [Series([3, 2, 1, 2, 5]), - Series([3., 2., 1., 2., 5.]), - Series([3., 2, 1, 2, 5], dtype='object'), - Series([3., 2, 1, 2, '5'], dtype='object'), - Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))] + base = [3, 2, 1, 2, 5] + + s_list = [ + Series(base, dtype='int8'), + Series(base, dtype='int16'), + Series(base, dtype='int32'), + Series(base, dtype='int64'), + Series(base, dtype='float32'), + Series(base, dtype='float64'), + Series(base, dtype='uint8'), + Series(base, dtype='uint16'), + Series(base, dtype='uint32'), + Series(base, dtype='uint64'), + Series(base).astype('timedelta64[ns]'), + Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005'])), + ] + + raising = [ + Series([3., 2, 1, 2, '5'], dtype='object'), + Series([3., 2, 1, 2, 5], dtype='object'), + Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3., 2, 1, 2, 5], dtype='complex128'), + ] + + for r in raising: + dt = r.dtype + msg = "Cannot use method 'n(larg|small)est' with dtype %s" % dt + args = 2, len(r), 0, -1 + methods = r.nlargest, r.nsmallest + for method, arg in product(methods, args): + with tm.assertRaisesRegexp(TypeError, msg): + method(arg) for s in s_list: @@ -4014,7 +4041,8 @@ def test_nsmallest_nlargest(self): assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]]) assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]]) - assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]]) + assert_series_equal(s.nlargest(3, take_last=True), + s.iloc[[4, 0, 3]]) empty = s.iloc[0:0] assert_series_equal(s.nsmallest(0), empty) @@ -4025,7 +4053,8 @@ def test_nsmallest_nlargest(self): assert_series_equal(s.nsmallest(len(s)), s.order()) assert_series_equal(s.nsmallest(len(s) + 1), s.order()) assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) - assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), + s.iloc[[4, 0, 1, 3, 2]]) s = Series([3., np.nan, 1, 2, 5]) assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 6a8cd0e358f87..1d6ed3e11c81e 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,8 +1,8 @@ +import operator from pandas.compat import reduce from pandas.core.index import Index import numpy as np from pandas import algos -import pandas.core.common as com def match(needles, haystack): @@ -46,68 +46,3 @@ def compose(*funcs): """Compose 2 or more callables""" assert len(funcs) > 1, 'At least 2 callables must be passed to compose' return reduce(_compose2, funcs) - - -_dtype_map = {'datetime64[ns]': 'int64', 'int64': 'int64', - 'float64': 'float64'} - - -def nsmallest(arr, n=5, take_last=False): - ''' - Find the indices of the n smallest values of a numpy array. - - Note: Fails silently with NaN. - - ''' - if n <= 0: - return np.array([]) # empty - elif n >= len(arr): - n = len(arr) - - if arr.dtype == object: - # just sort and take n - return arr.argsort(kind='mergesort')[:n] - - try: - dtype = _dtype_map[str(arr.dtype)] - except KeyError: - raise NotImplementedError("Not implemented for %s dtype, " - "perhaps convert to int64 or float64, " - "or use .order().head(n)") % arr.dtype - - arr = arr.view(dtype) - - if take_last: - arr = arr[::-1] - - kth_val = algos.kth_smallest(arr.copy(), n - 1) - - ns, = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind='mergesort')][:n] - - if take_last: - # reverse indices - return len(arr) - 1 - inds - return inds - - -def nlargest(arr, n=5, take_last=False): - ''' - Find the indices of the n largest values of a numpy array. - - Note: Fails silently with NaN. - - ''' - if n <= 0: - return np.array([]) # empty - - n = min(n, len(arr)) - - if arr.dtype == object: - try: - arr = arr.astype(float) - except: - raise TypeError("An object array must convert to float.") - - arr = -arr.view(_dtype_map[str(arr.dtype)]) - return nsmallest(arr, n, take_last=take_last) From 66737055d3fdc4e8cc13505b7ac9f565aae69765 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 14 May 2014 16:27:55 -0400 Subject: [PATCH 3/3] DOC: doc blurb in basics.rst --- doc/source/basics.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index ca9751569336c..5aa84f46debea 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1311,6 +1311,21 @@ Some other sorting notes / nuances: compatibility with NumPy methods which expect the ``ndarray.sort`` behavior. +.. versionadded:: 0.14.0 + +``Series`` has the ``nsmallest`` and ``nlargest`` methods which return the +smallest or largest :math:`n` values. For a large ``Series`` this can be much +faster than sorting the entire Series and calling ``head(n)`` on the result. + +.. ipython:: python + + s = Series(np.random.permutation(10)) + s + s.order() + s.nsmallest(3) + s.nlargest(3) + + Sorting by a multi-index column ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~