ENH nlargest and nsmallest Series methods

hayd · cpcloud · commit 1d9bc57d8b90 · 2014-05-14T16:28:03.000-04:00
diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt
@@ -128,6 +128,7 @@ API changes
       import pandas.core.common as com
       com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan]))
       np.array_equal(np.array([0, np.nan]), np.array([0, np.nan]))
 
+- Add ``nsmallest`` and ``nlargest`` Series methods (:issue:`3960`)
 - ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a
   ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -21,6 +21,9 @@ from numpy cimport NPY_FLOAT16 as NPY_float16
 from numpy cimport NPY_FLOAT32 as NPY_float32
 from numpy cimport NPY_FLOAT64 as NPY_float64
 
+from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+                    uint32_t, uint64_t, float16_t, float32_t, float64_t)
+
 int8 = np.dtype(np.int8)
 int16 = np.dtype(np.int16)
 int32 = np.dtype(np.int32)
@@ -736,16 +739,34 @@ def _check_minp(win, minp, N):
 # Physical description: 366 p.
 #               Series: Prentice-Hall Series in Automatic Computation
 
-def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
-    cdef:
-        Py_ssize_t i,j,l,m,n
-        double_t x, t
+ctypedef fused kth_type:  # element types swap_kth/kth_smallest/median accept
+    int8_t
+    int16_t
+    int32_t
+    int64_t
 
-    n = len(a)
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
 
-    l = 0
-    m = n-1
-    while (l<m):
+    float32_t
+    float64_t  # NOTE(review): float16_t is cimported above but not listed here
+
+
+cdef void swap_kth(kth_type *a, kth_type *b):  # exchange *a and *b in place
+    cdef kth_type t  # temporary holding the value originally behind a
+    t = a[0]
+    a[0] = b[0]
+    b[0] = t
+
+
+cpdef kth_type kth_smallest(kth_type[:] a, Py_ssize_t k):
+    cdef:
+        Py_ssize_t i, j, l = 0, n = a.size, m = n - 1
+        kth_type x
+
+    while l < m:
         x = a[k]
         i = l
         j = m
@@ -754,9 +775,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
             while a[i] < x: i += 1
             while x < a[j]: j -= 1
             if i <= j:
-                t = a[i]
-                a[i] = a[j]
-                a[j] = t
+                swap_kth(&a[i], &a[j])
                 i += 1; j -= 1
 
             if i > j: break
@@ -765,6 +784,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
         if k < i: m = j
     return a[k]
 
+
 cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
     cdef:
         Py_ssize_t i,j,l,m
@@ -781,9 +801,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
             while a[i] < x: i += 1
             while x < a[j]: j -= 1
             if i <= j:
-                t = a[i]
-                a[i] = a[j]
-                a[j] = t
+                swap_kth(&a[i], &a[j])
                 i += 1; j -= 1
 
             if i > j: break
@@ -793,22 +811,22 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
     return a[k]
 
 
-def median(ndarray arr):
+cpdef float64_t median(kth_type[:] arr):  # float result: NaN and x.5 must fit
     '''
     A faster median
     '''
-    cdef int n = len(arr)
+    cdef Py_ssize_t n = arr.size
 
-    if len(arr) == 0:
+    if n == 0:  # empty input has no median; NaN needs the float return type
         return np.NaN
 
     arr = arr.copy()
 
     if n % 2:
-        return kth_smallest(arr, n / 2)
+        return kth_smallest(arr, n // 2)  # odd length: the middle element
     else:
-        return (kth_smallest(arr, n / 2) +
-                kth_smallest(arr, n / 2 - 1)) / 2
+        return (kth_smallest(arr, n // 2) +
+                kth_smallest(arr, n // 2 - 1)) / 2.0  # mean of the two middles
 
 
 # -------------- Min, Max subsequence
@@ -2226,7 +2244,7 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
 
 
     if n % 2:
-        result = kth_smallest_c(a, n / 2, n)
+        result = kth_smallest_c( a, n / 2, n)
     else:
         result = (kth_smallest_c(a, n / 2, n) +
                   kth_smallest_c(a, n / 2 - 1, n)) / 2
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -6,7 +6,6 @@
 # pylint: disable=E1101,E1103
 # pylint: disable=W0703,W0622,W0613,W0201
 
-import operator
 import types
 import warnings
 
@@ -15,29 +14,24 @@
 import numpy.ma as ma
 
 from pandas.core.common import (isnull, notnull, _is_bool_indexer,
-                                _default_index, _maybe_promote, _maybe_upcast,
-                                _asarray_tuplesafe, is_integer_dtype,
-                                _NS_DTYPE, _TD_DTYPE,
-                                _infer_dtype_from_scalar, is_list_like,
-                                _values_from_object,
+                                _default_index, _maybe_upcast,
+                                _asarray_tuplesafe, _infer_dtype_from_scalar,
+                                is_list_like, _values_from_object,
                                 _possibly_cast_to_datetime, _possibly_castable,
-                                _possibly_convert_platform,
-                                _try_sort,
+                                _possibly_convert_platform, _try_sort,
                                 ABCSparseArray, _maybe_match_name,
                                 _ensure_object, SettingWithCopyError)
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
                                _ensure_index)
-from pandas.core.indexing import (
-    _check_bool_indexer,
-    _is_index_slice, _maybe_convert_indices)
+from pandas.core.indexing import _check_bool_indexer, _maybe_convert_indices
 from pandas.core import generic, base
 from pandas.core.internals import SingleBlockManager
 from pandas.core.categorical import Categorical
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.period import PeriodIndex, Period
 from pandas import compat
 from pandas.util.terminal import get_terminal_size
-from pandas.compat import zip, lzip, u, OrderedDict
+from pandas.compat import zip, u, OrderedDict
 
 import pandas.core.array as pa
 import pandas.core.ops as ops
@@ -46,7 +40,7 @@
 import pandas.core.datetools as datetools
 import pandas.core.format as fmt
 import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, Substitution, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly
 
 import pandas.lib as lib
 import pandas.tslib as tslib
@@ -1705,7 +1699,17 @@ def _try_kind_sort(arr):
         good = ~bad
         idx = pa.arange(len(self))
 
-        argsorted = _try_kind_sort(arr[good])
+        def _try_kind_sort(arr, kind='mergesort'):
+            """Argsort ``arr`` with ``kind``; retry with quicksort on TypeError."""
+            try:
+                indexer = arr.argsort(kind=kind)
+            except TypeError:
+                # a stable sort may be unavailable for object dtype, so
+                # retry with an explicit quicksort
+                indexer = arr.argsort(kind='quicksort')
+            return indexer
+
+        argsorted = _try_kind_sort(arr[good], kind=kind)
 
         if not ascending:
             argsorted = argsorted[::-1]
@@ -1728,6 +1732,51 @@ def _try_kind_sort(arr):
         else:
             return result.__finalize__(self)
 
+    def nlargest(self, n=5, take_last=False):
+        '''
+        Return the n largest non-null values, largest first.
+
+        take_last keeps the last of tied values instead of the first.
+        May be faster than .order(ascending=False).head(n).
+        '''
+        # TODO remove need for dropna ?
+        dropped = self.dropna()
+
+        from pandas.tools.util import nlargest
+
+        values = dropped.values
+        try:
+            if values.dtype == object:
+                # rank on a float copy; the result keeps the original values
+                values = values.astype(float)
+            inds = nlargest(values, n, take_last)
+        except (NotImplementedError, TypeError, ValueError, KeyError):
+            # not numeric or no fast path; this fallback ignores take_last
+            return dropped.order(ascending=False).head(max(n, 0))
+        # TODO remove this special case
+        return dropped.iloc[inds] if len(inds) else dropped[[]]
+
+    def nsmallest(self, n=5, take_last=False):
+        '''
+        Return the n smallest non-null values, smallest first.
+
+        take_last keeps the last of tied values instead of the first.
+        May be faster than .order().head(n).
+        '''
+        # TODO remove need for dropna ?
+        dropped = self.dropna()
+
+        from pandas.tools.util import nsmallest
+        try:
+            inds = nsmallest(dropped.values, n, take_last)
+        except (NotImplementedError, TypeError):
+            # no fast path for this dtype (the helper may surface that as a
+            # TypeError); sort instead -- this fallback ignores take_last
+            return dropped.order().head(n)
+
+        # TODO remove this special case
+        return dropped.iloc[inds] if len(inds) else dropped[[]]
+
     def sortlevel(self, level=0, ascending=True, sort_remaining=True):
         """
         Sort Series with MultiIndex by chosen level. Data will be
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -3998,6 +3998,39 @@ def test_order(self):
         ordered = ts.order(ascending=False, na_position='first')
         assert_almost_equal(expected, ordered.valid().values)
 
+    def test_nsmallest_nlargest(self):
+        # float, int, datetime64 (use i8), timedelta64 (same; not yet in
+        # s_list), object that are numbers, object that are strings
+
+        s_list = [Series([3, 2, 1, 2, 5]),
+                  Series([3., 2., 1., 2., 5.]),
+                  Series([3., 2, 1, 2, 5], dtype='object'),
+                  Series([3., 2, 1, 2, '5'], dtype='object'),
+                  Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))]
+
+        for s in s_list:
+
+            assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
+            assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])
+
+            assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]])
+            assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]])
+
+            empty = s.iloc[0:0]  # expected result for any non-positive n
+            assert_series_equal(s.nsmallest(0), empty)
+            assert_series_equal(s.nsmallest(-1), empty)
+            assert_series_equal(s.nlargest(0), empty)
+            assert_series_equal(s.nlargest(-1), empty)
+
+            assert_series_equal(s.nsmallest(len(s)), s.order())
+            assert_series_equal(s.nsmallest(len(s) + 1), s.order())
+            assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]])
+            assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]])
+
+        s = Series([3., np.nan, 1, 2, 5])  # the NaN is absent from both results
+        assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
+        assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])
+
     def test_rank(self):
         from pandas.compat.scipy import rankdata
 
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
@@ -1,6 +1,9 @@
 from pandas.compat import reduce
 from pandas.core.index import Index
 import numpy as np
+from pandas import algos
+import pandas.core.common as com
+
 
 def match(needles, haystack):
     haystack = Index(haystack)
@@ -17,7 +20,7 @@ def cartesian_product(X):
     --------
     >>> cartesian_product([list('ABC'), [1, 2]])
     [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
- 	array([1, 2, 1, 2, 1, 2])]
+    array([1, 2, 1, 2, 1, 2])]
 
     '''
 
@@ -43,3 +46,68 @@ def compose(*funcs):
     """Compose 2 or more callables"""
     assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
     return reduce(_compose2, funcs)
+
+
+_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64',
+              'int64': 'int64', 'float64': 'float64'}  # name -> view dtype
+
+
+def nsmallest(arr, n=5, take_last=False):
+    '''
+    Find the indices of the n smallest values of a numpy array.
+
+    Indices are ordered by ascending value; take_last keeps the last of
+    tied values. Raises NotImplementedError for unsupported dtypes.
+    Note: Fails silently with NaN.
+    '''
+    n = min(n, len(arr))
+    if n <= 0:
+        return np.array([], dtype=np.intp)  # empty (also for empty arr)
+
+    if take_last:
+        # scan from the end so that later duplicates are met first
+        arr = arr[::-1]
+
+    if arr.dtype == object:
+        # just sort and take n
+        inds = arr.argsort(kind='mergesort')[:n]
+    else:
+        try:
+            dtype = _dtype_map[str(arr.dtype)]
+        except KeyError:
+            raise NotImplementedError("Not implemented for %s dtype, "
+                                      "perhaps convert to int64 or float64, "
+                                      "or use .order().head(n)" % arr.dtype)
+        arr = arr.view(dtype)
+        kth_val = algos.kth_smallest(arr.copy(), n - 1)
+        # everything <= the kth value is a candidate; the stable sort
+        # keeps ties in positional order before trimming to n
+        ns, = np.nonzero(arr <= kth_val)
+        inds = ns[arr[ns].argsort(kind='mergesort')][:n]
+
+    if take_last:
+        # reverse indices
+        return len(arr) - 1 - inds
+    return inds
+
+
+def nlargest(arr, n=5, take_last=False):
+    '''
+    Find the indices of the n largest values of a numpy array.
+
+    Object arrays must convert to float (TypeError otherwise); other
+    unsupported dtypes raise NotImplementedError. Fails silently with NaN.
+    '''
+    if n <= 0:
+        return np.array([], dtype=np.intp)  # empty
+    if arr.dtype == object:
+        try:
+            arr = arr.astype(float)
+        except Exception:
+            raise TypeError("An object array must convert to float.")
+    try:
+        view_dtype = _dtype_map[str(arr.dtype)]
+    except KeyError:
+        raise NotImplementedError("Not implemented for %s dtype" % arr.dtype)
+    # the n largest values are the n smallest of the negated array
+    return nsmallest(-arr.view(view_dtype), min(n, len(arr)), take_last)
diff --git a/vb_suite/series_methods.py b/vb_suite/series_methods.py