pandas-dev · cpcloud · May 14, 2014 · Nov 17, 2013 · May 12, 2014 · May 14, 2014
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1311,6 +1311,21 @@ Some other sorting notes / nuances:
     compatibility with NumPy methods which expect the ``ndarray.sort``
     behavior.
 
+.. versionadded:: 0.14.0
+
+``Series`` has the ``nsmallest`` and ``nlargest`` methods which return the
+smallest or largest :math:`n` values. For a large ``Series`` this can be much
+faster than sorting the entire Series and calling ``head(n)`` on the result.
+
+.. ipython:: python
+
+   s = Series(np.random.permutation(10))
+   s
+   s.order()
+   s.nsmallest(3)
+   s.nlargest(3)
+
+
 Sorting by a multi-index column
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -643,6 +643,7 @@ Enhancements
                 values='Quantity', aggfunc=np.sum)
 
 - str.wrap implemented (:issue:`6999`)
+- Add nsmallest and nlargest Series methods (:issue:`3960`)
 
 - `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`)
 

diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -21,6 +21,9 @@ from numpy cimport NPY_FLOAT16 as NPY_float16
 from numpy cimport NPY_FLOAT32 as NPY_float32
 from numpy cimport NPY_FLOAT64 as NPY_float64
 
+from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+                    uint32_t, uint64_t, float16_t, float32_t, float64_t)
+
 int8 = np.dtype(np.int8)
 int16 = np.dtype(np.int16)
 int32 = np.dtype(np.int32)
@@ -736,16 +739,43 @@ def _check_minp(win, minp, N):
 # Physical description: 366 p.
 #               Series: Prentice-Hall Series in Automatic Computation
 
-def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
-    cdef:
-        Py_ssize_t i,j,l,m,n
-        double_t x, t
 
-    n = len(a)
+# Element types for which the ``numeric``-typed routines in this module
+# (``swap``, ``kth_smallest``, ``median``) are specialised: signed and
+# unsigned integers of 8 to 64 bits plus 32- and 64-bit floats.
+ctypedef fused numeric:
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+
+    float32_t
+    float64_t
+
+
+# Exchange the two values that ``a`` and ``b`` point to.
+#
+# Always returns 0; the ``except -1`` clause reserves -1 as the error
+# marker, a value this body never produces.
+cdef inline Py_ssize_t swap(numeric *a, numeric *b) except -1:
+    cdef numeric t
+
+    # cython doesn't allow pointer dereference so use array syntax
+    t = a[0]
+    a[0] = b[0]
+    b[0] = t
+    return 0
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
+    cdef:
+        Py_ssize_t i, j, l, m, n = a.size
+        numeric x
 
     l = 0
-    m = n-1
-    while (l<m):
+    m = n - 1
+
+    while l < m:
         x = a[k]
         i = l
         j = m
@@ -754,9 +784,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
             while a[i] < x: i += 1
             while x < a[j]: j -= 1
             if i <= j:
-                t = a[i]
-                a[i] = a[j]
-                a[j] = t
+                swap(&a[i], &a[j])
                 i += 1; j -= 1
 
             if i > j: break
@@ -765,6 +793,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
         if k < i: m = j
     return a[k]
 
+
 cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
     cdef:
         Py_ssize_t i,j,l,m
@@ -781,9 +810,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
             while a[i] < x: i += 1
             while x < a[j]: j -= 1
             if i <= j:
-                t = a[i]
-                a[i] = a[j]
-                a[j] = t
+                swap(&a[i], &a[j])
                 i += 1; j -= 1
 
             if i > j: break
@@ -793,22 +820,22 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
     return a[k]
 
 
-def median(ndarray arr):
+cpdef numeric median(numeric[:] arr):
     '''
     A faster median
     '''
-    cdef int n = len(arr)
+    cdef Py_ssize_t n = arr.size
 
-    if len(arr) == 0:
+    if n == 0:
+        # NOTE(review): the declared return type is the fused ``numeric``;
+        # confirm NaN is an acceptable result for the integer variants
         return np.NaN
 
+    # work on a copy: kth_smallest swaps elements of its argument in place
     arr = arr.copy()
 
     if n % 2:
-        return kth_smallest(arr, n / 2)
+        return kth_smallest(arr, n // 2)
     else:
-        return (kth_smallest(arr, n / 2) +
-                kth_smallest(arr, n / 2 - 1)) / 2
+        return (kth_smallest(arr, n // 2) +
+                kth_smallest(arr, n // 2 - 1)) / 2
 
 
 # -------------- Min, Max subsequence
@@ -2226,7 +2253,7 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
 
 
     if n % 2:
-        result = kth_smallest_c(a, n / 2, n)
+        result = kth_smallest_c( a, n / 2, n)
     else:
         result = (kth_smallest_c(a, n / 2, n) +
                   kth_smallest_c(a, n / 2 - 1, n)) / 2

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -9,9 +9,7 @@
 import pandas.core.common as com
 import pandas.algos as algos
 import pandas.hashtable as htable
-import pandas.compat as compat
-from pandas.compat import filter, string_types
-from pandas.util.decorators import deprecate_kwarg
+from pandas.compat import string_types
 
 def match(to_match, values, na_sentinel=-1):
     """
@@ -413,6 +411,90 @@ def group_position(*args):
     return result
 
 
+# Datetime-like dtype names mapped to the integer dtype they are viewed as
+# (via ``arr.view``) in ``nsmallest`` and ``nlargest``; any dtype name not
+# listed here is looked up with itself as the default, i.e. left unchanged.
+_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}
+
+
+def _finalize_nsmallest(arr, kth_val, n, take_last, narr):
+    """Turn a threshold value into the positions of the n smallest values.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Values being searched. The caller ``nsmallest`` passes the array
+        already reversed when ``take_last`` is true.
+    kth_val : scalar
+        Threshold; only positions whose value is ``<= kth_val`` are kept.
+    n : int
+        Maximum number of positions to return.
+    take_last : bool
+        If true, each position ``i`` is mirrored to ``narr - 1 - i`` before
+        being returned.
+    narr : int
+        Length used for the mirroring above.
+
+    Returns
+    -------
+    inds : ndarray
+        At most ``n`` positions, ordered by ascending value. The stable
+        mergesort keeps equal values in their order of appearance in
+        ``arr``.
+    """
+    ns, = np.nonzero(arr <= kth_val)
+    inds = ns[arr[ns].argsort(kind='mergesort')][:n]
+
+    if take_last:
+        # reverse indices
+        return narr - 1 - inds
+    return inds
+
+
+def nsmallest(arr, n, take_last=False):
+    '''
+    Find the indices of the n smallest values of a numpy array.
+
+    Note: Fails silently with NaN.
+
+    Parameters
+    ----------
+    arr : ndarray
+        One-dimensional array to search. datetime64[ns] and
+        timedelta64[ns] arrays are viewed as int64.
+    n : int
+        Number of positions wanted; capped at ``len(arr)``.
+    take_last : bool, default False
+        If true, the array is searched in reverse so that among equal
+        values the later positions are returned.
+
+    Returns
+    -------
+    inds : ndarray
+        Positions into ``arr`` ordered by ascending value. Empty when
+        ``arr`` is empty or ``n <= 0``.
+    '''
+    if take_last:
+        arr = arr[::-1]
+
+    narr = len(arr)
+    n = min(n, narr)
+
+    if n <= 0:
+        # Nothing to select. Returning here also keeps kth_smallest from
+        # being asked for position -1, which it would read with bounds
+        # checking and wraparound disabled.
+        return np.empty(0, dtype=np.intp)
+
+    sdtype = str(arr.dtype)
+    arr = arr.view(_dtype_map.get(sdtype, sdtype))
+
+    # kth_smallest reorders its argument in place, so hand it a copy
+    kth_val = algos.kth_smallest(arr.copy(), n - 1)
+    return _finalize_nsmallest(arr, kth_val, n, take_last, narr)
+
+
+def nlargest(arr, n, take_last=False):
+    """
+    Find the indices of the n largest values of a numpy array.
+
+    Note: Fails silently with NaN.
+
+    Parameters
+    ----------
+    arr : ndarray
+        One-dimensional array to search. datetime64[ns] and
+        timedelta64[ns] arrays are viewed as int64.
+    n : int
+        Number of positions wanted.
+    take_last : bool, default False
+        Passed through to ``nsmallest``.
+
+    Returns
+    -------
+    inds : ndarray
+        Positions into ``arr`` ordered by descending value.
+    """
+    sdtype = str(arr.dtype)
+    arr = arr.view(_dtype_map.get(sdtype, sdtype))
+
+    if arr.dtype.kind in 'iu':
+        # Bitwise NOT reverses the ordering of integers exactly, whereas
+        # negation wraps around for unsigned values (-0 stays 0) and for
+        # the minimum signed value.
+        flipped = ~arr
+    else:
+        flipped = -arr
+    return nsmallest(flipped, n, take_last=take_last)
+
+
+def select_n_slow(dropped, n, take_last, method):
+    """Select the n largest/smallest values by sorting all of ``dropped``.
+
+    ``dropped`` is reversed first when ``take_last`` is true or ``method``
+    is ``'nlargest'``, then sorted with ``order`` (ascending only for
+    ``'nsmallest'``) and truncated with ``head(n)``.
+
+    NOTE(review): the reversal presumably controls which of several equal
+    values comes first in the result -- confirm against ``Series.order``.
+    """
+    reverse_it = take_last or method == 'nlargest'
+    ascending = method == 'nsmallest'
+    slc = np.s_[::-1] if reverse_it else np.s_[:]
+    return dropped[slc].order(ascending=ascending).head(n)
+
+
+# Dispatch table used by ``select_n``: method name -> function returning the
+# positions of the selected values.
+_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
+
+
+def select_n(series, n, take_last, method):
+    """Implement n largest/smallest.
+
+    Parameters
+    ----------
+    series : Series
+        Values to select from; missing values are dropped first.
+    n : int
+    take_last : bool
+    method : str, {'nlargest', 'nsmallest'}
+
+    Returns
+    -------
+    nordered : Series
+
+    Raises
+    ------
+    TypeError
+        If the dtype of `series` is not integer, floating, datetime64 or
+        timedelta64.
+    """
+    dtype = series.dtype
+    if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64,
+                                   np.timedelta64)):
+        raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))
+
+    if n <= 0:
+        return series[[]]
+
+    dropped = series.dropna()
+
+    # Compare against the number of values that remain after dropping
+    # missing ones: when every remaining value is requested (including the
+    # case where none remain) a full sort is used and the selection routine
+    # is never handed an empty array.
+    if n >= len(dropped):
+        return select_n_slow(dropped, n, take_last, method)
+
+    inds = _select_methods[method](dropped.values, n, take_last)
+    return dropped.iloc[inds]
+
+
 _rank1d_functions = {
     'float64': algos.rank_1d_float64,
     'int64': algos.rank_1d_int64,

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -6,7 +6,6 @@
 # pylint: disable=E1101,E1103
 # pylint: disable=W0703,W0622,W0613,W0201
 
-import operator
 import types
 import warnings
 
@@ -15,38 +14,34 @@
 import numpy.ma as ma
 
 from pandas.core.common import (isnull, notnull, _is_bool_indexer,
-                                _default_index, _maybe_promote, _maybe_upcast,
-                                _asarray_tuplesafe, is_integer_dtype,
-                                _NS_DTYPE, _TD_DTYPE,
-                                _infer_dtype_from_scalar, is_list_like,
-                                _values_from_object,
+                                _default_index, _maybe_upcast,
+                                _asarray_tuplesafe, _infer_dtype_from_scalar,
+                                is_list_like, _values_from_object,
                                 _possibly_cast_to_datetime, _possibly_castable,
-                                _possibly_convert_platform,
-                                _try_sort,
+                                _possibly_convert_platform, _try_sort,
                                 ABCSparseArray, _maybe_match_name,
                                 _ensure_object, SettingWithCopyError)
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
                                _ensure_index)
-from pandas.core.indexing import (
-    _check_bool_indexer,
-    _is_index_slice, _maybe_convert_indices)
+from pandas.core.indexing import _check_bool_indexer, _maybe_convert_indices
 from pandas.core import generic, base
 from pandas.core.internals import SingleBlockManager
 from pandas.core.categorical import Categorical
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.period import PeriodIndex, Period
 from pandas import compat
 from pandas.util.terminal import get_terminal_size
-from pandas.compat import zip, lzip, u, OrderedDict
+from pandas.compat import zip, u, OrderedDict
 
 import pandas.core.array as pa
 import pandas.core.ops as ops
+from pandas.core.algorithms import select_n
 
 import pandas.core.common as com
 import pandas.core.datetools as datetools
 import pandas.core.format as fmt
 import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, Substitution, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly
 
 import pandas.lib as lib
 import pandas.tslib as tslib
@@ -1728,6 +1723,72 @@ def _try_kind_sort(arr):
         else:
             return result.__finalize__(self)
 
+    def nlargest(self, n=5, take_last=False):
+        """Return the largest `n` elements.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Return this many descending sorted values
+        take_last : bool, default False
+            Where there are duplicate values, take the last duplicate
+
+        Returns
+        -------
+        top_n : Series
+            The n largest values in the Series, in sorted order
+
+        Raises
+        ------
+        TypeError
+            If the dtype is not integer, floating, datetime64 or timedelta64
+
+        Notes
+        -----
+        Faster than ``.order(ascending=False).head(n)`` for small `n` relative
+        to the size of the ``Series`` object.
+
+        Missing values are dropped before the selection is made.
+
+        See Also
+        --------
+        Series.nsmallest
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> s = pd.Series(np.random.randn(10 ** 6))
+        >>> s.nlargest(10)  # only sorts up to the N requested
+        """
+        return select_n(self, n=n, take_last=take_last, method='nlargest')
+
+    def nsmallest(self, n=5, take_last=False):
+        """Return the smallest `n` elements.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Return this many ascending sorted values
+        take_last : bool, default False
+            Where there are duplicate values, take the last duplicate
+
+        Returns
+        -------
+        bottom_n : Series
+            The n smallest values in the Series, in sorted order
+
+        Raises
+        ------
+        TypeError
+            If the dtype is not integer, floating, datetime64 or timedelta64
+
+        Notes
+        -----
+        Faster than ``.order().head(n)`` for small `n` relative to
+        the size of the ``Series`` object.
+
+        Missing values are dropped before the selection is made.
+
+        See Also
+        --------
+        Series.nlargest
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> s = pd.Series(np.random.randn(10 ** 6))
+        >>> s.nsmallest(10)  # only sorts up to the N requested
+        """
+        return select_n(self, n=n, take_last=take_last, method='nsmallest')
+
     def sortlevel(self, level=0, ascending=True, sort_remaining=True):
         """
         Sort Series with MultiIndex by chosen level. Data will be