Skip to content

Commit dc14378

Browse files
committed
cython optimizations and cleanup
1 parent ca7fb48 commit dc14378

File tree

8 files changed

+107 additions, -89 deletions (lines changed)

8 files changed

+107 additions, -89 deletions (lines changed)

pandas/_libs/algos.pyx

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,8 @@ class NegInfinity(object):
7777
__ge__ = lambda self, other: isinstance(other, NegInfinity)
7878

7979

80+
@cython.wraparound(False)
81+
@cython.boundscheck(False)
8082
cpdef ndarray[int64_t, ndim=1] unique_deltas(int64_t[:] arr):
8183
"""
8284
Efficiently find the unique first-differences of the given array.

pandas/_libs/groupby.pyx

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ cdef inline float64_t kth_smallest_c(float64_t* a,
9898

9999
@cython.boundscheck(False)
100100
@cython.wraparound(False)
101-
def group_median_float64(ndarray[float64_t, ndim=2] out,
101+
def group_median_float64(float64_t[:, :] out,
102102
int64_t[:] counts,
103103
ndarray[float64_t, ndim=2] values,
104104
int64_t[:] labels,
@@ -109,7 +109,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
109109
cdef:
110110
Py_ssize_t i, j, N, K, ngroups, size
111111
int64_t[:] _counts
112-
ndarray data
112+
ndarray[float64_t, ndim=2] data
113113
float64_t* ptr
114114

115115
assert min_count == -1, "'min_count' only used in add and prod"
@@ -291,16 +291,16 @@ def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels,
291291
"""
292292
cdef:
293293
Py_ssize_t i, N
294-
ndarray[int64_t] sorted_labels
294+
int64_t[:] sorted_labels
295295
int64_t idx, curr_fill_idx=-1, filled_vals=0
296296

297297
N = len(out)
298298

299299
# Make sure all arrays are the same size
300300
assert N == len(labels) == len(mask)
301301

302-
sorted_labels = np.argsort(labels, kind='mergesort').astype(
303-
np.int64, copy=False)
302+
sorted_labels = np.argsort(labels, kind='mergesort').astype(np.int64,
303+
copy=False)
304304
if direction == 'bfill':
305305
sorted_labels = sorted_labels[::-1]
306306

@@ -327,7 +327,7 @@ def group_fillna_indexer(int64_t[:] out, ndarray[int64_t] labels,
327327

328328
@cython.boundscheck(False)
329329
@cython.wraparound(False)
330-
def group_any_all(ndarray[uint8_t] out,
330+
def group_any_all(uint8_t[:] out,
331331
int64_t[:] labels,
332332
uint8_t[:] values,
333333
uint8_t[:] mask,
@@ -370,7 +370,7 @@ def group_any_all(ndarray[uint8_t] out,
370370
else:
371371
raise ValueError("'bool_func' must be either 'any' or 'all'!")
372372

373-
out.fill(1 - flag_val)
373+
out[:] = 1 - flag_val
374374

375375
with nogil:
376376
for i in range(N):

pandas/_libs/groupby_helper.pxi.in

Lines changed: 25 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -220,7 +220,7 @@ def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out,
220220

221221
@cython.wraparound(False)
222222
@cython.boundscheck(False)
223-
def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,
223+
def group_ohlc_{{name}}({{c_type}}[:, :] out,
224224
int64_t[:] counts,
225225
ndarray[{{c_type}}, ndim=2] values,
226226
int64_t[:] labels,
@@ -246,7 +246,8 @@ def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,
246246
if K > 1:
247247
raise NotImplementedError("Argument 'values' must have only "
248248
"one dimension")
249-
out.fill(np.nan)
249+
250+
out[:] = np.nan
250251

251252
with nogil:
252253
for i in range(N):
@@ -304,16 +305,16 @@ def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out,
304305
cdef:
305306
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
306307
{{c_type}} val
307-
ndarray[{{c_type}}, ndim=2] resx
308-
ndarray[int64_t, ndim=2] nobs
308+
{{c_type}}[:, :] resx
309+
int64_t[:, :] nobs
309310

310311
assert min_count == -1, "'min_count' only used in add and prod"
311312

312313
if not len(values) == len(labels):
313314
raise AssertionError("len(index) != len(labels)")
314315

315316
nobs = np.zeros((<object> out).shape, dtype=np.int64)
316-
{{if name=='object'}}
317+
{{if name == 'object'}}
317318
resx = np.empty((<object> out).shape, dtype=object)
318319
{{else}}
319320
resx = np.empty_like(out)
@@ -361,8 +362,8 @@ def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out,
361362
cdef:
362363
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
363364
{{c_type}} val
364-
ndarray[{{c_type}}, ndim=2] resx
365-
ndarray[int64_t, ndim=2] nobs
365+
{{c_type}}[:, :] resx
366+
int64_t[:, :] nobs
366367

367368
assert min_count == -1, "'min_count' only used in add and prod"
368369

@@ -411,7 +412,7 @@ def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out,
411412

412413
@cython.boundscheck(False)
413414
@cython.wraparound(False)
414-
def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
415+
def group_rank_{{name}}(float64_t[:, :] out,
415416
ndarray[{{c_type}}, ndim=2] values,
416417
ndarray[int64_t] labels,
417418
bint is_datetimelike, object ties_method,
@@ -453,8 +454,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
453454
TiebreakEnumType tiebreak
454455
Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
455456
Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
456-
ndarray[int64_t] _as
457-
ndarray[float64_t, ndim=2] grp_sizes
457+
int64_t[:] _as
458+
float64_t[:, :] grp_sizes
458459
ndarray[{{c_type}}] masked_vals
459460
ndarray[uint8_t] mask
460461
bint keep_na
@@ -617,7 +618,7 @@ def group_max(ndarray[groupby_t, ndim=2] out,
617618
cdef:
618619
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
619620
groupby_t val, count, nan_val
620-
ndarray[groupby_t, ndim=2] maxx, nobs
621+
groupby_t[:, :] maxx, nobs
621622

622623
assert min_count == -1, "'min_count' only used in add and prod"
623624

@@ -629,10 +630,10 @@ def group_max(ndarray[groupby_t, ndim=2] out,
629630
maxx = np.empty_like(out)
630631
if groupby_t is int64_t:
631632
# Note: evaluated at compile-time
632-
maxx.fill(-_int64_max)
633+
maxx[:] = -_int64_max
633634
nan_val = iNaT
634635
else:
635-
maxx.fill(-np.inf)
636+
maxx[:] = -np.inf
636637
nan_val = NAN
637638

638639
N, K = (<object> values).shape
@@ -685,7 +686,7 @@ def group_min(ndarray[groupby_t, ndim=2] out,
685686
cdef:
686687
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
687688
groupby_t val, count, nan_val
688-
ndarray[groupby_t, ndim=2] minx, nobs
689+
groupby_t[:, :] minx, nobs
689690

690691
assert min_count == -1, "'min_count' only used in add and prod"
691692

@@ -696,10 +697,10 @@ def group_min(ndarray[groupby_t, ndim=2] out,
696697

697698
minx = np.empty_like(out)
698699
if groupby_t is int64_t:
699-
minx.fill(_int64_max)
700+
minx[:] = _int64_max
700701
nan_val = iNaT
701702
else:
702-
minx.fill(np.inf)
703+
minx[:] = np.inf
703704
nan_val = NAN
704705

705706
N, K = (<object> values).shape
@@ -741,7 +742,7 @@ group_min_int64 = group_min["int64_t"]
741742

742743
@cython.boundscheck(False)
743744
@cython.wraparound(False)
744-
def group_cummin(ndarray[groupby_t, ndim=2] out,
745+
def group_cummin(groupby_t[:, :] out,
745746
ndarray[groupby_t, ndim=2] values,
746747
ndarray[int64_t] labels,
747748
bint is_datetimelike):
@@ -751,15 +752,15 @@ def group_cummin(ndarray[groupby_t, ndim=2] out,
751752
cdef:
752753
Py_ssize_t i, j, N, K, size
753754
groupby_t val, mval
754-
ndarray[groupby_t, ndim=2] accum
755+
groupby_t[:, :] accum
755756
int64_t lab
756757

757758
N, K = (<object> values).shape
758759
accum = np.empty_like(values)
759760
if groupby_t is int64_t:
760-
accum.fill(_int64_max)
761+
accum[:] = _int64_max
761762
else:
762-
accum.fill(np.inf)
763+
accum[:] = np.inf
763764

764765
with nogil:
765766
for i in range(N):
@@ -794,7 +795,7 @@ group_cummin_int64 = group_cummin["int64_t"]
794795

795796
@cython.boundscheck(False)
796797
@cython.wraparound(False)
797-
def group_cummax(ndarray[groupby_t, ndim=2] out,
798+
def group_cummax(groupby_t[:, :] out,
798799
ndarray[groupby_t, ndim=2] values,
799800
ndarray[int64_t] labels,
800801
bint is_datetimelike):
@@ -804,15 +805,15 @@ def group_cummax(ndarray[groupby_t, ndim=2] out,
804805
cdef:
805806
Py_ssize_t i, j, N, K, size
806807
groupby_t val, mval
807-
ndarray[groupby_t, ndim=2] accum
808+
groupby_t[:, :] accum
808809
int64_t lab
809810

810811
N, K = (<object> values).shape
811812
accum = np.empty_like(values)
812813
if groupby_t is int64_t:
813-
accum.fill(-_int64_max)
814+
accum[:] = -_int64_max
814815
else:
815-
accum.fill(-np.inf)
816+
accum[:] = -np.inf
816817

817818
with nogil:
818819
for i in range(N):

pandas/_libs/hashtable.pyx

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,8 @@
22

33
cimport cython
44

5-
from cpython cimport (PyObject, Py_INCREF, PyList_Check, PyTuple_Check,
6-
PyMem_Malloc, PyMem_Realloc, PyMem_Free,
7-
PyString_Check, PyBytes_Check,
8-
PyUnicode_Check)
5+
from cpython cimport (PyObject, Py_INCREF,
6+
PyMem_Malloc, PyMem_Realloc, PyMem_Free)
97

108
from libc.stdlib cimport malloc, free
119

@@ -153,7 +151,7 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels):
153151
cdef:
154152
int ret = 0
155153
Py_ssize_t i, n = len(labels)
156-
kh_int64_t * table = kh_init_int64()
154+
kh_int64_t *table = kh_init_int64()
157155
Int64Vector idx = Int64Vector()
158156
ndarray[int64_t, ndim=1] arr
159157
Int64VectorData *ud = idx.data

0 commit comments

Comments
 (0)