Skip to content

ENH: add nlargest nsmallest to Series #7113

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 14, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,21 @@ Some other sorting notes / nuances:
compatibility with NumPy methods which expect the ``ndarray.sort``
behavior.

.. versionadded:: 0.14.0

``Series`` has the ``nsmallest`` and ``nlargest`` methods, which return the
smallest or largest :math:`n` values. For a large ``Series`` this can be much
faster than sorting the entire ``Series`` and calling ``head(n)`` on the result.

.. ipython:: python

s = Series(np.random.permutation(10))
s
s.order()
s.nsmallest(3)
s.nlargest(3)


Sorting by a multi-index column
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,7 @@ Enhancements
values='Quantity', aggfunc=np.sum)

- str.wrap implemented (:issue:`6999`)
- Add nsmallest and nlargest Series methods (:issue:`3960`)

- `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`)

Expand Down
67 changes: 47 additions & 20 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ from numpy cimport NPY_FLOAT16 as NPY_float16
from numpy cimport NPY_FLOAT32 as NPY_float32
from numpy cimport NPY_FLOAT64 as NPY_float64

from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float16_t, float32_t, float64_t)

int8 = np.dtype(np.int8)
int16 = np.dtype(np.int16)
int32 = np.dtype(np.int32)
Expand Down Expand Up @@ -736,16 +739,43 @@ def _check_minp(win, minp, N):
# Physical description: 366 p.
# Series: Prentice-Hall Series in Automatic Computation

def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
cdef:
Py_ssize_t i,j,l,m,n
double_t x, t

n = len(a)
# Fused type listing the element types the generic selection helpers in this
# file are specialised for.
# NOTE(review): float16_t is cimported above but is not a member of this
# fused type -- confirm that omission is intentional.
ctypedef fused numeric:
# signed integers
int8_t
int16_t
int32_t
int64_t

# unsigned integers
uint8_t
uint16_t
uint32_t
uint64_t

# floating point
float32_t
float64_t


cdef inline Py_ssize_t swap(numeric *a, numeric *b) except -1:
    # Exchange the values that ``a`` and ``b`` point at and return 0.
    # The ``except -1`` clause reserves -1 as the error return value.
    # Cython has no pointer-dereference operator, hence the ``[0]`` indexing.
    cdef numeric held = a[0]

    a[0] = b[0]
    b[0] = held
    return 0


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
cdef:
Py_ssize_t i, j, l, m, n = a.size
numeric x

l = 0
m = n-1
while (l<m):
m = n - 1

while l < m:
x = a[k]
i = l
j = m
Expand All @@ -754,9 +784,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
while a[i] < x: i += 1
while x < a[j]: j -= 1
if i <= j:
t = a[i]
a[i] = a[j]
a[j] = t
swap(&a[i], &a[j])
i += 1; j -= 1

if i > j: break
Expand All @@ -765,6 +793,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
if k < i: m = j
return a[k]


cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
cdef:
Py_ssize_t i,j,l,m
Expand All @@ -781,9 +810,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
while a[i] < x: i += 1
while x < a[j]: j -= 1
if i <= j:
t = a[i]
a[i] = a[j]
a[j] = t
swap(&a[i], &a[j])
i += 1; j -= 1

if i > j: break
Expand All @@ -793,22 +820,22 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
return a[k]


# NOTE(review): this span looks like a rendered diff -- several statements
# appear twice, the earlier form immediately followed by its replacement
# (two signatures, two declarations of ``n``, two emptiness checks, ``/``
# next to ``//``).  Confirm against the real file before editing.
def median(ndarray arr):
cpdef numeric median(numeric[:] arr):
'''
A faster median
'''
cdef int n = len(arr)
cdef Py_ssize_t n = arr.size

if len(arr) == 0:
if n == 0:
# NOTE(review): NaN is returned from a function declared to return
# ``numeric``; presumably this cannot be represented for the integer
# members of that fused type -- confirm behaviour on empty integer input.
return np.NaN

# Work on a copy: kth_smallest swaps elements of the array it is given.
arr = arr.copy()

# Odd length: the single middle element.
if n % 2:
return kth_smallest(arr, n / 2)
return kth_smallest(arr, n // 2)
# Even length: mean of the two middle elements.
else:
return (kth_smallest(arr, n / 2) +
kth_smallest(arr, n / 2 - 1)) / 2
return (kth_smallest(arr, n // 2) +
kth_smallest(arr, n // 2 - 1)) / 2


# -------------- Min, Max subsequence
Expand Down Expand Up @@ -2226,7 +2253,7 @@ cdef inline float64_t _median_linear(float64_t* a, int n):


if n % 2:
result = kth_smallest_c(a, n / 2, n)
result = kth_smallest_c( a, n / 2, n)
else:
result = (kth_smallest_c(a, n / 2, n) +
kth_smallest_c(a, n / 2 - 1, n)) / 2
Expand Down
88 changes: 85 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
import pandas.core.common as com
import pandas.algos as algos
import pandas.hashtable as htable
import pandas.compat as compat
from pandas.compat import filter, string_types
from pandas.util.decorators import deprecate_kwarg
from pandas.compat import string_types

def match(to_match, values, na_sentinel=-1):
"""
Expand Down Expand Up @@ -413,6 +411,90 @@ def group_position(*args):
return result


_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}


def _finalize_nsmallest(arr, kth_val, n, take_last, narr):
ns, = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind='mergesort')][:n]

if take_last:
# reverse indices
return narr - 1 - inds
return inds


def nsmallest(arr, n, take_last=False):
    '''
    Find the indices of the n smallest values of a numpy array.

    Parameters
    ----------
    arr : ndarray
        One-dimensional array of values.
    n : int
        Number of indices wanted; clamped to ``len(arr)``.
    take_last : bool, default False
        Where there are duplicate values, prefer the last occurrence.

    Returns
    -------
    inds : ndarray
        Positions into `arr`, ordered by ascending value.  Empty when
        `arr` is empty or ``n <= 0``.

    Note: Fails silently with NaN.

    '''
    narr = len(arr)
    n = min(n, narr)

    if n <= 0:
        # Nothing to select.  Without this guard kth_smallest would be asked
        # for element ``-1``, which is out of range for the compiled routine.
        return np.empty(0, dtype=np.intp)

    if take_last:
        # search the reversed array; positions are mapped back afterwards
        arr = arr[::-1]

    # datetime-like values are compared through their integer representation
    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))

    # kth_smallest is handed a copy, so ``arr`` keeps its original order
    kth_val = algos.kth_smallest(arr.copy(), n - 1)
    return _finalize_nsmallest(arr, kth_val, n, take_last, narr)


def nlargest(arr, n, take_last=False):
    """
    Find the indices of the n largest values of a numpy array.

    Parameters
    ----------
    arr : ndarray
        One-dimensional array of values.
    n : int
        Number of indices wanted.
    take_last : bool, default False
        Where there are duplicate values, prefer the last occurrence.

    Returns
    -------
    inds : ndarray
        Positions into `arr`, ordered by descending value.

    Note: Fails silently with NaN.
    """
    # datetime-like values are compared through their integer representation
    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))

    if arr.dtype.kind in 'iu':
        # Arithmetic negation does not reverse integer ordering at the
        # boundaries: for unsigned types ``-0 == 0`` (so 0 would rank as the
        # largest value) and for signed types ``-MIN`` overflows back to MIN.
        # Bitwise NOT (``-x - 1`` signed, ``MAX - x`` unsigned) is strictly
        # decreasing over the whole range and cannot overflow.
        flipped = ~arr
    else:
        flipped = -arr

    # the n smallest of the order-reversed values are the n largest of `arr`
    return nsmallest(flipped, n, take_last=take_last)


def select_n_slow(dropped, n, take_last, method):
    """Sort-based selection of the first `n` values.

    Parameters
    ----------
    dropped : Series
        Values to select from.
    n : int
        Number of leading values to keep after sorting.
    take_last : bool
        If true, the values are reversed before sorting.
    method : str, {'nlargest', 'nsmallest'}
        ``'nsmallest'`` sorts ascending, anything else descending.

    Returns
    -------
    Series
    """
    ascending = method == 'nsmallest'

    # NOTE(review): for 'nlargest' the input is reversed whether or not
    # take_last is set, so take_last has no additional effect in that case
    # -- confirm this is intended.
    if take_last or method == 'nlargest':
        slc = np.s_[::-1]
    else:
        slc = np.s_[:]

    ordered = dropped[slc].order(ascending=ascending)
    return ordered.head(n)


# dispatch table: ``method`` name -> array-level selection routine
_select_methods = {
    'nsmallest': nsmallest,
    'nlargest': nlargest,
}


def select_n(series, n, take_last, method):
    """Implement n largest/smallest.

    Parameters
    ----------
    series : Series
        Values to select from; missing values are dropped first.
    n : int
    take_last : bool
    method : str, {'nlargest', 'nsmallest'}

    Returns
    -------
    nordered : Series

    Raises
    ------
    TypeError
        If the dtype of `series` is not integer, floating, datetime64 or
        timedelta64.
    """
    dtype = series.dtype
    if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64,
                                   np.timedelta64)):
        raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))

    if n <= 0:
        return series[[]]

    dropped = series.dropna()

    if len(dropped) == 0:
        # Every value was missing.  With ``n < len(series)`` the fast path
        # below would otherwise run the selection routine on an empty array.
        return dropped

    if n >= len(series):
        # everything is wanted: a plain sort is all that is needed
        return select_n_slow(dropped, n, take_last, method)

    inds = _select_methods[method](dropped.values, n, take_last)
    return dropped.iloc[inds]


_rank1d_functions = {
'float64': algos.rank_1d_float64,
'int64': algos.rank_1d_int64,
Expand Down
87 changes: 74 additions & 13 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201

import operator
import types
import warnings

Expand All @@ -15,38 +14,34 @@
import numpy.ma as ma

from pandas.core.common import (isnull, notnull, _is_bool_indexer,
_default_index, _maybe_promote, _maybe_upcast,
_asarray_tuplesafe, is_integer_dtype,
_NS_DTYPE, _TD_DTYPE,
_infer_dtype_from_scalar, is_list_like,
_values_from_object,
_default_index, _maybe_upcast,
_asarray_tuplesafe, _infer_dtype_from_scalar,
is_list_like, _values_from_object,
_possibly_cast_to_datetime, _possibly_castable,
_possibly_convert_platform,
_try_sort,
_possibly_convert_platform, _try_sort,
ABCSparseArray, _maybe_match_name,
_ensure_object, SettingWithCopyError)
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
_ensure_index)
from pandas.core.indexing import (
_check_bool_indexer,
_is_index_slice, _maybe_convert_indices)
from pandas.core.indexing import _check_bool_indexer, _maybe_convert_indices
from pandas.core import generic, base
from pandas.core.internals import SingleBlockManager
from pandas.core.categorical import Categorical
from pandas.tseries.index import DatetimeIndex
from pandas.tseries.period import PeriodIndex, Period
from pandas import compat
from pandas.util.terminal import get_terminal_size
from pandas.compat import zip, lzip, u, OrderedDict
from pandas.compat import zip, u, OrderedDict

import pandas.core.array as pa
import pandas.core.ops as ops
from pandas.core.algorithms import select_n

import pandas.core.common as com
import pandas.core.datetools as datetools
import pandas.core.format as fmt
import pandas.core.nanops as nanops
from pandas.util.decorators import Appender, Substitution, cache_readonly
from pandas.util.decorators import Appender, cache_readonly

import pandas.lib as lib
import pandas.tslib as tslib
Expand Down Expand Up @@ -1728,6 +1723,72 @@ def _try_kind_sort(arr):
else:
return result.__finalize__(self)

def nlargest(self, n=5, take_last=False):
    """Return the largest `n` elements.

    Parameters
    ----------
    n : int, default 5
        Return this many values, sorted in descending order
    take_last : bool, default False
        Where there are duplicate values, take the last duplicate

    Returns
    -------
    top_n : Series
        The n largest values in the Series, in sorted order

    Notes
    -----
    Faster than ``.order(ascending=False).head(n)`` for small `n` relative
    to the size of the ``Series`` object.

    See Also
    --------
    Series.nsmallest

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(np.random.randn(10 ** 6))
    >>> s.nlargest(10)  # only sorts up to the N requested
    """
    return select_n(self, n=n, take_last=take_last, method='nlargest')

def nsmallest(self, n=5, take_last=False):
    """Return the smallest `n` elements.

    Parameters
    ----------
    n : int, default 5
        Return this many values, sorted in ascending order
    take_last : bool, default False
        Where there are duplicate values, take the last duplicate

    Returns
    -------
    bottom_n : Series
        The n smallest values in the Series, in sorted order

    Notes
    -----
    Faster than ``.order().head(n)`` for small `n` relative to
    the size of the ``Series`` object.

    See Also
    --------
    Series.nlargest

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(np.random.randn(10 ** 6))
    >>> s.nsmallest(10)  # only sorts up to the N requested
    """
    return select_n(self, n=n, take_last=take_last, method='nsmallest')

def sortlevel(self, level=0, ascending=True, sort_remaining=True):
"""
Sort Series with MultiIndex by chosen level. Data will be
Expand Down
Loading