Skip to content

Commit d022e04

Browse files
committed
ENH: group_max/min and bin versions, close #1019
1 parent 2fa4ba8 commit d022e04

File tree

4 files changed

+297
-3
lines changed

4 files changed

+297
-3
lines changed

pandas/core/groupby.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,28 @@ def prod(self):
320320
except Exception:
321321
return self.aggregate(lambda x: np.prod(x, axis=self.axis))
322322

323+
def min(self):
    """
    Compute the minimum of each group's values, excluding missing values

    For multiple groupings, the result index will be a MultiIndex
    """
    # Try the `_cython_agg_general` path first; if it raises for any
    # reason, fall back to a generic np.min aggregation along self.axis.
    try:
        fast_result = self._cython_agg_general('min')
    except Exception:
        return self.aggregate(lambda grp: np.min(grp, axis=self.axis))
    else:
        return fast_result
333+
334+
def max(self):
    """
    Compute the maximum of each group's values, excluding missing values

    For multiple groupings, the result index will be a MultiIndex
    """
    # Try the `_cython_agg_general` path first; if it raises for any
    # reason, fall back to a generic np.max aggregation along self.axis.
    try:
        fast_result = self._cython_agg_general('max')
    except Exception:
        return self.aggregate(lambda grp: np.max(grp, axis=self.axis))
    else:
        return fast_result
344+
323345
def ohlc(self):
324346
"""
325347
Compute sum of values, excluding missing values
@@ -604,6 +626,8 @@ def get_group_levels(self):
604626
_cython_functions = {
605627
'add' : lib.group_add,
606628
'prod' : lib.group_prod,
629+
'min' : lib.group_min,
630+
'max' : lib.group_max,
607631
'mean' : lib.group_mean,
608632
'var' : lib.group_var,
609633
'std' : lib.group_var

pandas/src/groupby.pyx

Lines changed: 237 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -279,9 +279,9 @@ def group_add(ndarray[float64_t, ndim=2] out,
279279
@cython.boundscheck(False)
280280
@cython.wraparound(False)
281281
def group_prod(ndarray[float64_t, ndim=2] out,
282-
ndarray[int32_t] counts,
283-
ndarray[float64_t, ndim=2] values,
284-
ndarray[int32_t] labels):
282+
ndarray[int32_t] counts,
283+
ndarray[float64_t, ndim=2] values,
284+
ndarray[int32_t] labels):
285285
'''
286286
Only aggregates on axis=0
287287
'''
@@ -331,6 +331,124 @@ def group_prod(ndarray[float64_t, ndim=2] out,
331331
out[i, j] = prodx[i, j]
332332

333333

334+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_min(ndarray[float64_t, ndim=2] out,
              ndarray[int32_t] counts,
              ndarray[float64_t, ndim=2] values,
              ndarray[int32_t] labels):
    '''
    Compute the per-group minimum of each column, skipping NaN.

    Only aggregates on axis=0

    Parameters
    ----------
    out : 2-d float64 array, written in place
        out[g, j] receives the smallest non-NaN value of column j among the
        rows labelled g, or NaN when that group has no non-NaN value in
        column j. Rows 0 .. len(counts) - 1 are written.
    counts : 1-d int32 array, updated in place
        counts[g] is incremented once for every row labelled g, whether or
        not its values are NaN. The buffer is not reset here, so any
        existing contents are added to.
    values : 2-d float64 array
        Data to aggregate; one row per observation.
    labels : 1-d int32 array
        Group index of each row of `values`; rows with a negative label
        are ignored.
    '''
    cdef:
        Py_ssize_t i, j, N, K, lab
        float64_t val
        ndarray[float64_t, ndim=2] minx, nobs

    # nobs[g, j]: number of non-NaN values seen so far for group g, column j
    nobs = np.zeros_like(out)

    # running minima, seeded with +inf so the first real value replaces it
    minx = np.empty_like(out)
    minx.fill(np.inf)

    N, K = (<object> values).shape

    if K > 1:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    if val < minx[lab, j]:
                        minx[lab, j] = val
    else:
        # single-column case: same logic without the inner column loop
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            val = values[i, 0]

            # not nan
            if val == val:
                nobs[lab, 0] += 1
                if val < minx[lab, 0]:
                    minx[lab, 0] = val

    # groups with no valid observation report NaN rather than the +inf seed
    for i in range(len(counts)):
        for j in range(K):
            if nobs[i, j] == 0:
                out[i, j] = nan
            else:
                out[i, j] = minx[i, j]
391+
392+
393+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_max(ndarray[float64_t, ndim=2] out,
              ndarray[int32_t] counts,
              ndarray[float64_t, ndim=2] values,
              ndarray[int32_t] labels):
    '''
    Compute the per-group maximum of each column, skipping NaN.

    Only aggregates on axis=0

    Parameters
    ----------
    out : 2-d float64 array, written in place
        out[g, j] receives the largest non-NaN value of column j among the
        rows labelled g, or NaN when that group has no non-NaN value in
        column j. Rows 0 .. len(counts) - 1 are written.
    counts : 1-d int32 array, updated in place
        counts[g] is incremented once for every row labelled g, whether or
        not its values are NaN. The buffer is not reset here, so any
        existing contents are added to.
    values : 2-d float64 array
        Data to aggregate; one row per observation.
    labels : 1-d int32 array
        Group index of each row of `values`; rows with a negative label
        are ignored.
    '''
    cdef:
        Py_ssize_t i, j, N, K, lab
        float64_t val
        ndarray[float64_t, ndim=2] maxx, nobs

    # nobs[g, j]: number of non-NaN values seen so far for group g, column j
    nobs = np.zeros_like(out)

    # running maxima, seeded with -inf so the first real value replaces it
    maxx = np.empty_like(out)
    maxx.fill(-np.inf)

    N, K = (<object> values).shape

    if K > 1:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    if val > maxx[lab, j]:
                        maxx[lab, j] = val
    else:
        # single-column case: same logic without the inner column loop
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            val = values[i, 0]

            # not nan
            if val == val:
                nobs[lab, 0] += 1
                if val > maxx[lab, 0]:
                    maxx[lab, 0] = val

    # groups with no valid observation report NaN rather than the -inf seed
    for i in range(len(counts)):
        for j in range(K):
            if nobs[i, j] == 0:
                out[i, j] = nan
            else:
                out[i, j] = maxx[i, j]
450+
451+
334452
@cython.boundscheck(False)
335453
@cython.wraparound(False)
336454
def group_mean(ndarray[float64_t, ndim=2] out,
@@ -621,6 +739,122 @@ def group_prod_bin(ndarray[float64_t, ndim=2] out,
621739
else:
622740
out[i, j] = prodx[i, j]
623741

742+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_min_bin(ndarray[float64_t, ndim=2] out,
                  ndarray[int32_t] counts,
                  ndarray[float64_t, ndim=2] values,
                  ndarray[int32_t] bins):
    '''
    Compute the per-bin minimum of each column, skipping NaN.

    Only aggregates on axis=0

    Parameters
    ----------
    out : 2-d float64 array, written in place
        out[b, j] receives the smallest non-NaN value of column j among the
        rows in bin b, or NaN when the bin has no non-NaN value in column
        j. Rows 0 .. len(bins) are written.
    counts : 1-d int32 array, updated in place
        counts[b] is incremented once for every row assigned to bin b. The
        buffer is not reset here.
    values : 2-d float64 array
        Data to aggregate; rows are consumed in order.
    bins : 1-d int32 array
        Row positions at which the next bin starts: row i belongs to the
        first bin b with i < bins[b], or to the final bin (index
        len(bins)) when i is at or past every edge. Repeated edges denote
        empty bins, which are reported as NaN with a count of zero.
    '''
    cdef:
        Py_ssize_t i, j, N, K, ngroups, b
        float64_t val
        ndarray[float64_t, ndim=2] minx, nobs

    # nobs[b, j]: number of non-NaN values seen so far for bin b, column j
    nobs = np.zeros_like(out)

    # running minima, seeded with +inf so the first real value replaces it
    minx = np.empty_like(out)
    minx.fill(np.inf)

    # len(bins) edges delimit len(bins) + 1 bins
    ngroups = len(bins) + 1
    N, K = (<object> values).shape

    b = 0
    if K > 1:
        for i in range(N):
            # `while`, not `if`: several edges may sit at the same row
            # position (empty bins) and all of them must be stepped over
            # before this row is assigned.
            while b < ngroups - 1 and i >= bins[b]:
                b += 1

            counts[b] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[b, j] += 1
                    if val < minx[b, j]:
                        minx[b, j] = val
    else:
        # single-column case: same logic without the inner column loop
        for i in range(N):
            while b < ngroups - 1 and i >= bins[b]:
                b += 1

            counts[b] += 1
            val = values[i, 0]

            # not nan
            if val == val:
                nobs[b, 0] += 1
                if val < minx[b, 0]:
                    minx[b, 0] = val

    # bins with no valid observation report NaN rather than the +inf seed
    for i in range(ngroups):
        for j in range(K):
            if nobs[i, j] == 0:
                out[i, j] = nan
            else:
                out[i, j] = minx[i, j]
800+
801+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_max_bin(ndarray[float64_t, ndim=2] out,
                  ndarray[int32_t] counts,
                  ndarray[float64_t, ndim=2] values,
                  ndarray[int32_t] bins):
    '''
    Compute the per-bin maximum of each column, skipping NaN.

    Only aggregates on axis=0

    Parameters
    ----------
    out : 2-d float64 array, written in place
        out[b, j] receives the largest non-NaN value of column j among the
        rows in bin b, or NaN when the bin has no non-NaN value in column
        j. Rows 0 .. len(bins) are written.
    counts : 1-d int32 array, updated in place
        counts[b] is incremented once for every row assigned to bin b. The
        buffer is not reset here.
    values : 2-d float64 array
        Data to aggregate; rows are consumed in order.
    bins : 1-d int32 array
        Row positions at which the next bin starts: row i belongs to the
        first bin b with i < bins[b], or to the final bin (index
        len(bins)) when i is at or past every edge. Repeated edges denote
        empty bins, which are reported as NaN with a count of zero.
    '''
    cdef:
        Py_ssize_t i, j, N, K, ngroups, b
        float64_t val
        ndarray[float64_t, ndim=2] maxx, nobs

    # nobs[b, j]: number of non-NaN values seen so far for bin b, column j
    nobs = np.zeros_like(out)

    # running maxima, seeded with -inf so the first real value replaces it
    maxx = np.empty_like(out)
    maxx.fill(-np.inf)

    # len(bins) edges delimit len(bins) + 1 bins
    ngroups = len(bins) + 1
    N, K = (<object> values).shape

    b = 0
    if K > 1:
        for i in range(N):
            # `while`, not `if`: several edges may sit at the same row
            # position (empty bins) and all of them must be stepped over
            # before this row is assigned.
            while b < ngroups - 1 and i >= bins[b]:
                b += 1

            counts[b] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[b, j] += 1
                    if val > maxx[b, j]:
                        maxx[b, j] = val
    else:
        # single-column case: same logic without the inner column loop
        for i in range(N):
            while b < ngroups - 1 and i >= bins[b]:
                b += 1

            counts[b] += 1
            val = values[i, 0]

            # not nan
            if val == val:
                nobs[b, 0] += 1
                if val > maxx[b, 0]:
                    maxx[b, 0] = val

    # bins with no valid observation report NaN rather than the -inf seed
    for i in range(ngroups):
        for j in range(K):
            if nobs[i, j] == 0:
                out[i, j] = nan
            else:
                out[i, j] = maxx[i, j]
857+
624858

625859
@cython.boundscheck(False)
626860
@cython.wraparound(False)

pandas/tests/test_groupby.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -990,6 +990,8 @@ def _testit(op):
990990
_testit(lambda x: x.sum())
991991
_testit(lambda x: x.mean())
992992
_testit(lambda x: x.prod())
993+
_testit(lambda x: x.min())
994+
_testit(lambda x: x.max())
993995

994996
def test_cython_agg_boolean(self):
995997
frame = DataFrame({'a': np.random.randint(0, 5, 50),

pandas/tests/test_tseries.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,40 @@ def test_group_prod_bin():
356356

357357
assert_almost_equal(out, exp)
358358

359+
def test_group_min_bin():
    # Check that the bin-based kernel agrees with the label-based one when
    # both describe the same three contiguous groups.

    # original group_min
    obj = np.random.randn(10, 1)

    lab = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int32)
    # group_min adds to its counts buffer, so it has to start at zero for
    # the result to be the real per-group row count
    cts = np.zeros(3, dtype=np.int32)
    exp = np.zeros((3, 1), np.float64)
    lib.group_min(exp, cts, obj, lab)

    # bin-based group_min: edges 3 and 6 give rows [0, 3), [3, 6), [6, 10)
    bins = np.array([3, 6], dtype=np.int32)
    out = np.zeros((3, 1), np.float64)
    counts = np.zeros(len(out), dtype=np.int32)
    lib.group_min_bin(out, counts, obj, bins)

    assert_almost_equal(out, exp)
    # the two kernels must also agree on how many rows fell in each group
    assert (counts == cts).all()
375+
376+
def test_group_max_bin():
    # Check that the bin-based kernel agrees with the label-based one when
    # both describe the same three contiguous groups.

    # original group_max
    obj = np.random.randn(10, 1)

    lab = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int32)
    # group_max adds to its counts buffer, so it has to start at zero for
    # the result to be the real per-group row count
    cts = np.zeros(3, dtype=np.int32)
    exp = np.zeros((3, 1), np.float64)
    lib.group_max(exp, cts, obj, lab)

    # bin-based group_max: edges 3 and 6 give rows [0, 3), [3, 6), [6, 10)
    bins = np.array([3, 6], dtype=np.int32)
    out = np.zeros((3, 1), np.float64)
    counts = np.zeros(len(out), dtype=np.int32)
    lib.group_max_bin(out, counts, obj, bins)

    assert_almost_equal(out, exp)
    # the two kernels must also agree on how many rows fell in each group
    assert (counts == cts).all()
392+
359393
def test_group_var_bin():
360394
# original group_var
361395
obj = np.random.randn(10, 1)

0 commit comments

Comments
 (0)