Skip to content

Commit a909e16

Browse files
committed
DOC/REF: add docstrings and DRY it up
1 parent 1d9bc57 commit a909e16

File tree

7 files changed

+197
-130
lines changed

7 files changed

+197
-130
lines changed

doc/source/v0.13.1.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ API changes
128128
import pandas.core.common as com
129129
com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan]))
130130
np.array_equal(np.array([0, np.nan]), np.array([0, np.nan]))
131-
- Add nsmallest and nlargest Series methods (:issue:`3960`)
132131

133132
- ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a
134133
``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is

doc/source/v0.14.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,7 @@ Enhancements
643643
values='Quantity', aggfunc=np.sum)
644644

645645
- str.wrap implemented (:issue:`6999`)
646+
- Add nsmallest and nlargest Series methods (:issue:`3960`)
646647

647648
- `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`)
648649

pandas/algos.pyx

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -739,7 +739,8 @@ def _check_minp(win, minp, N):
739739
# Physical description: 366 p.
740740
# Series: Prentice-Hall Series in Automatic Computation
741741

742-
ctypedef fused kth_type:
742+
743+
ctypedef fused numeric:
743744
int8_t
744745
int16_t
745746
int32_t
@@ -754,17 +755,25 @@ ctypedef fused kth_type:
754755
float64_t
755756

756757

757-
cdef void swap_kth(kth_type *a, kth_type *b):
758-
cdef kth_type t
758+
cdef inline Py_ssize_t swap(numeric *a, numeric *b) except -1:
759+
cdef numeric t
760+
761+
# cython doesn't allow pointer dereference so use array syntax
759762
t = a[0]
760763
a[0] = b[0]
761764
b[0] = t
765+
return 0
762766

763767

764-
cpdef kth_type kth_smallest(kth_type[:] a, Py_ssize_t k):
768+
@cython.boundscheck(False)
769+
@cython.wraparound(False)
770+
cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
765771
cdef:
766-
Py_ssize_t i, j, l = 0, n = a.size, m = n - 1
767-
kth_type x
772+
Py_ssize_t i, j, l, m, n = a.size
773+
numeric x
774+
775+
l = 0
776+
m = n - 1
768777

769778
while l < m:
770779
x = a[k]
@@ -775,7 +784,7 @@ cpdef kth_type kth_smallest(kth_type[:] a, Py_ssize_t k):
775784
while a[i] < x: i += 1
776785
while x < a[j]: j -= 1
777786
if i <= j:
778-
swap_kth(&a[i], &a[j])
787+
swap(&a[i], &a[j])
779788
i += 1; j -= 1
780789

781790
if i > j: break
@@ -801,7 +810,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
801810
while a[i] < x: i += 1
802811
while x < a[j]: j -= 1
803812
if i <= j:
804-
swap_kth(&a[i], &a[j])
813+
swap(&a[i], &a[j])
805814
i += 1; j -= 1
806815

807816
if i > j: break
@@ -811,7 +820,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
811820
return a[k]
812821

813822

814-
cpdef kth_type median(kth_type[:] arr):
823+
cpdef numeric median(numeric[:] arr):
815824
'''
816825
A faster median
817826
'''

pandas/core/algorithms.py

Lines changed: 85 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99
import pandas.core.common as com
1010
import pandas.algos as algos
1111
import pandas.hashtable as htable
12-
import pandas.compat as compat
13-
from pandas.compat import filter, string_types
14-
from pandas.util.decorators import deprecate_kwarg
12+
from pandas.compat import string_types
1513

1614
def match(to_match, values, na_sentinel=-1):
1715
"""
@@ -413,6 +411,90 @@ def group_position(*args):
413411
return result
414412

415413

414+
# dtype name -> dtype the data is viewed as before selection.  nsmallest and
# nlargest look the array's dtype up here and call ``arr.view`` with the
# result, so datetime-like values are reinterpreted as int64; any dtype not
# listed is viewed as itself.
_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}
415+
416+
417+
def _finalize_nsmallest(arr, kth_val, n, take_last, narr):
418+
ns, = np.nonzero(arr <= kth_val)
419+
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
420+
421+
if take_last:
422+
# reverse indices
423+
return narr - 1 - inds
424+
return inds
425+
426+
427+
def nsmallest(arr, n, take_last=False):
    """
    Find the indices of the n smallest values of a numpy array.

    Parameters
    ----------
    arr : ndarray
        One-dimensional array of a numeric or datetime-like dtype.
    n : int
        Number of positions wanted; clamped to ``len(arr)``.
    take_last : bool, default False
        If True, ties are resolved in favour of the last occurrence.

    Returns
    -------
    ndarray
        Positions into the original ``arr``, ordered by ascending value.
        Empty when ``arr`` is empty or ``n <= 0``.

    Notes
    -----
    Fails silently with NaN.
    """
    if take_last:
        # search the reversed array so that later duplicates come first;
        # _finalize_nsmallest maps the positions back afterwards
        arr = arr[::-1]

    narr = len(arr)
    n = min(n, narr)

    if n <= 0:
        # Nothing to select.  Without this guard kth_smallest would be
        # called with k == -1, and it is compiled with bounds checking and
        # negative-index wraparound disabled.
        return np.empty(0, dtype=np.intp)

    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))

    # kth_smallest swaps elements in place, so hand it a copy
    kth_val = algos.kth_smallest(arr.copy(), n - 1)
    return _finalize_nsmallest(arr, kth_val, n, take_last, narr)
445+
446+
447+
def nlargest(arr, n, take_last=False):
    """
    Find the indices of the n largest values of a numpy array.

    Parameters
    ----------
    arr : ndarray
        One-dimensional array of a numeric or datetime-like dtype.
    n : int
        Number of positions wanted.
    take_last : bool, default False
        If True, ties are resolved in favour of the last occurrence.

    Returns
    -------
    ndarray
        Positions into ``arr``, ordered by descending value.

    Notes
    -----
    Fails silently with NaN.
    """
    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))

    if arr.dtype.kind in 'iu':
        # Arithmetic negation is not order-reversing for every integer:
        # unsigned 0 and the signed minimum both map to themselves and would
        # be ranked as the largest value.  Bitwise NOT (-x - 1 for signed,
        # max - x for unsigned) reverses the order without overflow.
        flipped = ~arr
    else:
        flipped = -arr
    return nsmallest(flipped, n, take_last=take_last)
456+
457+
458+
def select_n_slow(dropped, n, take_last, method):
    """Sort-based selection: order all of ``dropped`` and keep the first n.

    Parameters
    ----------
    dropped : Series
        Values to select from.
    n : int
        Number of leading elements to keep after ordering.
    take_last : bool
        If True, the input is reversed before ordering.
    method : str, {'nlargest', 'nsmallest'}
        ``'nlargest'`` also reverses the input and orders descending;
        ``'nsmallest'`` orders ascending.

    Returns
    -------
    Series
    """
    if take_last or method == 'nlargest':
        source = dropped[::-1]
    else:
        source = dropped[:]
    return source.order(ascending=(method == 'nsmallest')).head(n)
463+
464+
465+
# Maps the ``method`` string accepted by select_n to the function that
# computes the selected positions.
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
466+
467+
468+
def select_n(series, n, take_last, method):
    """Implement n largest/smallest.

    Parameters
    ----------
    series : Series
        Values to select from; missing values are dropped first.
    n : int
    take_last : bool
    method : str, {'nlargest', 'nsmallest'}

    Returns
    -------
    nordered : Series

    Raises
    ------
    TypeError
        If the dtype of ``series`` is not integer, floating, datetime64 or
        timedelta64.
    """
    dtype = series.dtype
    if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64,
                                   np.timedelta64)):
        raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))

    if n <= 0:
        return series[[]]

    dropped = series.dropna()

    # An all-missing series leaves nothing for the partial-selection routine
    # to work on (it would be asked for element -1 of an empty array), so
    # send that case down the sort-based path together with n >= len(series).
    if n >= len(series) or len(dropped) == 0:
        return select_n_slow(dropped, n, take_last, method)

    inds = _select_methods[method](dropped.values, n, take_last)
    return dropped.iloc[inds]
496+
497+
416498
_rank1d_functions = {
417499
'float64': algos.rank_1d_float64,
418500
'int64': algos.rank_1d_int64,

pandas/core/series.py

Lines changed: 56 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
import pandas.core.array as pa
3737
import pandas.core.ops as ops
38+
from pandas.core.algorithms import select_n
3839

3940
import pandas.core.common as com
4041
import pandas.core.datetools as datetools
@@ -1699,17 +1700,7 @@ def _try_kind_sort(arr):
16991700
good = ~bad
17001701
idx = pa.arange(len(self))
17011702

1702-
def _try_kind_sort(arr, kind='mergesort'):
1703-
# easier to ask forgiveness than permission
1704-
try:
1705-
# if kind==mergesort, it can fail for object dtype
1706-
return arr.argsort(kind=kind)
1707-
except TypeError:
1708-
# stable sort not available for object dtype
1709-
# uses the argsort default quicksort
1710-
return arr.argsort(kind='quicksort')
1711-
1712-
argsorted = _try_kind_sort(arr[good], kind=kind)
1703+
argsorted = _try_kind_sort(arr[good])
17131704

17141705
if not ascending:
17151706
argsorted = argsorted[::-1]
@@ -1733,49 +1724,70 @@ def _try_kind_sort(arr, kind='mergesort'):
17331724
return result.__finalize__(self)
17341725

17351726
def nlargest(self, n=5, take_last=False):
1736-
'''
1737-
Returns the largest n rows:
1727+
"""Return the largest `n` elements.
17381728
1739-
May be faster than .order(ascending=False).head(n).
1729+
Parameters
1730+
----------
1731+
n : int
1732+
Return this many descending sorted values
1733+
take_last : bool
1734+
Where there are duplicate values, take the last duplicate
17401735
1741-
'''
1742-
# TODO remove need for dropna ?
1743-
dropped = self.dropna()
1736+
Returns
1737+
-------
1738+
top_n : Series
1739+
The n largest values in the Series, in sorted order
17441740
1745-
from pandas.tools.util import nlargest
1741+
Notes
1742+
-----
1743+
Faster than ``.order(ascending=False).head(n)`` for small `n` relative
1744+
to the size of the ``Series`` object.
17461745
1747-
if dropped.dtype == object:
1748-
try:
1749-
dropped = dropped.astype(float)
1750-
except (NotImplementedError, TypeError):
1751-
return dropped.order(ascending=False).head(n)
1746+
See Also
1747+
--------
1748+
Series.nsmallest
17521749
1753-
inds = nlargest(dropped.values, n, take_last)
1754-
if len(inds) == 0:
1755-
# TODO remove this special case
1756-
return dropped[[]]
1757-
return dropped.iloc[inds]
1750+
Examples
1751+
--------
1752+
>>> import pandas as pd
1753+
>>> import numpy as np
1754+
>>> s = pd.Series(np.random.randn(int(1e6)))
1755+
>>> s.nlargest(10) # only sorts up to the N requested
1756+
"""
1757+
return select_n(self, n=n, take_last=take_last, method='nlargest')
17581758

17591759
def nsmallest(self, n=5, take_last=False):
1760-
'''
1761-
Returns the smallest n rows.
1760+
"""Return the smallest `n` elements.
17621761
1763-
May be faster than .order().head(n).
1762+
Parameters
1763+
----------
1764+
n : int
1765+
Return this many ascending sorted values
1766+
take_last : bool
1767+
Where there are duplicate values, take the last duplicate
17641768
1765-
'''
1766-
# TODO remove need for dropna ?
1767-
dropped = self.dropna()
1769+
Returns
1770+
-------
1771+
bottom_n : Series
1772+
The n smallest values in the Series, in sorted order
17681773
1769-
from pandas.tools.util import nsmallest
1770-
try:
1771-
inds = nsmallest(dropped.values, n, take_last)
1772-
except NotImplementedError:
1773-
return dropped.order().head(n)
1774-
1775-
if len(inds) == 0:
1776-
# TODO remove this special case
1777-
return dropped[[]]
1778-
return dropped.iloc[inds]
1774+
Notes
1775+
-----
1776+
Faster than ``.order().head(n)`` for small `n` relative to
1777+
the size of the ``Series`` object.
1778+
1779+
See Also
1780+
--------
1781+
Series.nlargest
1782+
1783+
Examples
1784+
--------
1785+
>>> import pandas as pd
1786+
>>> import numpy as np
1787+
>>> s = pd.Series(np.random.randn(int(1e6)))
1788+
>>> s.nsmallest(10) # only sorts up to the N requested
1789+
"""
1790+
return select_n(self, n=n, take_last=take_last, method='nsmallest')
17791791

17801792
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
17811793
"""

pandas/tests/test_series.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4002,19 +4002,47 @@ def test_nsmallest_nlargest(self):
40024002
# float, int, datetime64 (use i8), timedelta64 (same),
40034003
# object that are numbers, object that are strings
40044004

4005-
s_list = [Series([3, 2, 1, 2, 5]),
4006-
Series([3., 2., 1., 2., 5.]),
4007-
Series([3., 2, 1, 2, 5], dtype='object'),
4008-
Series([3., 2, 1, 2, '5'], dtype='object'),
4009-
Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))]
4005+
base = [3, 2, 1, 2, 5]
4006+
4007+
s_list = [
4008+
Series(base, dtype='int8'),
4009+
Series(base, dtype='int16'),
4010+
Series(base, dtype='int32'),
4011+
Series(base, dtype='int64'),
4012+
Series(base, dtype='float32'),
4013+
Series(base, dtype='float64'),
4014+
Series(base, dtype='uint8'),
4015+
Series(base, dtype='uint16'),
4016+
Series(base, dtype='uint32'),
4017+
Series(base, dtype='uint64'),
4018+
Series(base).astype('timedelta64[ns]'),
4019+
Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005'])),
4020+
]
4021+
4022+
raising = [
4023+
Series([3., 2, 1, 2, '5'], dtype='object'),
4024+
Series([3., 2, 1, 2, 5], dtype='object'),
4025+
Series([3., 2, 1, 2, 5], dtype='complex256'),
4026+
Series([3., 2, 1, 2, 5], dtype='complex128'),
4027+
]
4028+
4029+
for r in raising:
4030+
dt = r.dtype
4031+
msg = "Cannot use method 'n(larg|small)est' with dtype %s" % dt
4032+
args = 2, len(r), 0, -1
4033+
methods = r.nlargest, r.nsmallest
4034+
for method, arg in product(methods, args):
4035+
with tm.assertRaisesRegexp(TypeError, msg):
4036+
method(arg)
40104037

40114038
for s in s_list:
40124039

40134040
assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
40144041
assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])
40154042

40164043
assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]])
4017-
assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]])
4044+
assert_series_equal(s.nlargest(3, take_last=True),
4045+
s.iloc[[4, 0, 3]])
40184046

40194047
empty = s.iloc[0:0]
40204048
assert_series_equal(s.nsmallest(0), empty)
@@ -4025,7 +4053,8 @@ def test_nsmallest_nlargest(self):
40254053
assert_series_equal(s.nsmallest(len(s)), s.order())
40264054
assert_series_equal(s.nsmallest(len(s) + 1), s.order())
40274055
assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]])
4028-
assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]])
4056+
assert_series_equal(s.nlargest(len(s) + 1),
4057+
s.iloc[[4, 0, 1, 3, 2]])
40294058

40304059
s = Series([3., np.nan, 1, 2, 5])
40314060
assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])

0 commit comments

Comments
 (0)