diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ce0158b05c2ab..b4cad23ee0b5a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -170,6 +170,8 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) - Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`) +- .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ff46c699c71e7..6c5388a38c345 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -326,8 +326,12 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): Py_ssize_t i, j, xi, yi, N, K bint minpv float64_t[:, ::1] result + # Initialize to None since we only use in the no missing value case + float64_t[::1] means=None, ssqds=None ndarray[uint8_t, ndim=2] mask + bint no_nans int64_t nobs = 0 + float64_t mean, ssqd, val float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy N, K = (mat).shape @@ -339,25 +343,57 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + no_nans = mask.all() + + # Computing the online means and variances is expensive - so if possible we can + # precompute these and avoid repeating the computations each time we handle + # an (xi, yi) pair + if no_nans: + means = np.empty(K, dtype=np.float64) + ssqds = np.empty(K, dtype=np.float64) + + with nogil: + for j in range(K): + ssqd = mean = 0 + for i in range(N): + val = mat[i, j] + dx = val - mean + mean += 1 / (i + 1) * dx + ssqd += (val - mean) * dx + + means[j] = mean + ssqds[j] = ssqd with nogil: for xi in range(K): for yi in range(xi + 1): - # Welford's method for the variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 - for i in range(N): - if mask[i, xi] and mask[i, yi]: + covxy = 0 + if no_nans: + for i in range(N): vx = mat[i, xi] vy = mat[i, yi] - nobs += 1 - dx = vx - meanx - dy = vy - meany - meanx += 1 / nobs * dx - meany += 1 / nobs * dy - ssqdmx += (vx - meanx) * dx - ssqdmy += (vy - meany) * dy - covxy += (vx - meanx) * dy + covxy += (vx - means[xi]) * (vy - means[yi]) + + ssqdmx = ssqds[xi] + ssqdmy = ssqds[yi] + nobs = N + + else: + nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 + for i in range(N): + # Welford's method for the variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] + vy = mat[i, yi] + nobs += 1 + dx = vx - meanx + dy = vy - meany + meanx += 1 / nobs * dx + meany += 1 / nobs * dy + ssqdmx += (vx - meanx) * dx + ssqdmy += (vy - meany) * dy + covxy += (vx - meanx) * dy if nobs < minpv: result[xi, yi] = result[yi, xi] = NaN