Merge remote-tracking branch 'upstream/main' into td-construction

patrickmckenna · patrickmckenna · commit 613e36592be9 · 2022-05-13T12:57:19.000-05:00
diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml
@@ -1,4 +1,4 @@
-name: Windows
+name: Windows-MacOS
 
 on:
   push:
@@ -21,18 +21,20 @@ env:
 
 jobs:
   pytest:
-    runs-on: windows-latest
     defaults:
       run:
         shell: bash -el {0}
     timeout-minutes: 90
     strategy:
       matrix:
+        os: [macos-latest, windows-latest]
         env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
       fail-fast: false
+    runs-on: ${{ matrix.os }}
+    name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-windows
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }}
       cancel-in-progress: true
 
     steps:
@@ -47,10 +49,17 @@ jobs:
         mamba-version: "*"
         channels: conda-forge
         activate-environment: pandas-dev
-        channel-priority: strict
+        channel-priority: ${{ matrix.os == 'macos-latest' && 'flexible' || 'strict' }}
         environment-file: ci/deps/${{ matrix.env_file }}
         use-only-tar-bz2: true
 
+    # macOS only: install pyarrow=6 because importing pyarrow otherwise fails with
+    #   "ImportError: Library not loaded: @rpath/libssl.1.1.dylib", referenced from
+    #   .../miniconda3/envs/pandas-dev/lib/libthrift.0.13.0.dylib ("image not found").
+    - name: Upgrade pyarrow on MacOS
+      if: ${{ matrix.os == 'macos-latest' }}
+      run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=6
+
     - name: Build Pandas
       uses: ./.github/actions/build_pandas
 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -22,11 +22,6 @@ variables:
   PANDAS_CI: 1
 
 jobs:
-- template: ci/azure/posix.yml
-  parameters:
-    name: macOS
-    vmImage: macOS-10.15
-
 - job: py38_32bit
   pool:
     vmImage: ubuntu-18.04
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
diff --git a/ci/setup_env.sh b/ci/setup_env.sh
@@ -73,15 +73,6 @@ mamba install -n pandas-dev 'setuptools<60'
 echo "conda list -n pandas-dev"
 conda list -n pandas-dev
 
-# From pyarrow on MacOS
-# ImportError: 2): Library not loaded: @rpath/libssl.1.1.dylib
-# Referenced from: /Users/runner/miniconda3/envs/pandas-dev/lib/libthrift.0.13.0.dylib
-# Reason: image not found
-if [[ "$(uname)" == 'Darwin' ]]; then
-    echo "Update pyarrow for pyarrow on MacOS"
-    conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=6
-fi
-
 if [[ "$BITS32" == "yes" ]]; then
     # activate 32-bit compiler
     export CONDA_BUILD=1
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -732,6 +732,7 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
 - Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`)
 - Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
+- Bug in :meth:`Rolling.skew` and :meth:`Rolling.kurt` would give NaN with window of same values (:issue:`30993`)
 - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
 - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
 - Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`)
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -455,8 +455,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 
 
 cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
-                                float64_t x, float64_t xx,
-                                float64_t xxx) nogil:
+                                float64_t x, float64_t xx, float64_t xxx,
+                                int64_t num_consecutive_same_value
+                                ) nogil:
     cdef:
         float64_t result, dnobs
         float64_t A, B, C, R
@@ -467,6 +468,12 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
         B = xx / dnobs - A * A
         C = xxx / dnobs - A * A * A - 3 * A * B
 
+        if nobs < 3:
+            result = NaN
+        # GH 42064 46431
+        # uniform case, force result to be 0
+        elif num_consecutive_same_value >= nobs:
+            result = 0.0
         # #18044: with uniform distribution, floating issue will
         #         cause B != 0. and cause the result is a very
         #         large number.
@@ -476,7 +483,7 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
         #         if the variance is less than 1e-14, it could be
         #         treat as zero, here we follow the original
         #         skew/kurt behaviour to check B <= 1e-14
-        if B <= 1e-14 or nobs < 3:
+        elif B <= 1e-14:
             result = NaN
         else:
             R = sqrt(B)
@@ -493,7 +500,10 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
                           float64_t *xxx,
                           float64_t *compensation_x,
                           float64_t *compensation_xx,
-                          float64_t *compensation_xxx) nogil:
+                          float64_t *compensation_xxx,
+                          int64_t *num_consecutive_same_value,
+                          float64_t *prev_value,
+                          ) nogil:
     """ add a value from the skew calc """
     cdef:
         float64_t y, t
@@ -515,6 +525,14 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
         compensation_xxx[0] = t - xxx[0] - y
         xxx[0] = t
 
+        # GH#42064, record num of same values to remove floating point artifacts
+        if val == prev_value[0]:
+            num_consecutive_same_value[0] += 1
+        else:
+            # reset to 1 (include current value itself)
+            num_consecutive_same_value[0] = 1
+        prev_value[0] = val
+
 
 cdef inline void remove_skew(float64_t val, int64_t *nobs,
                              float64_t *x, float64_t *xx,
@@ -553,8 +571,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
         float64_t compensation_xx_add, compensation_xx_remove
         float64_t compensation_x_add, compensation_x_remove
         float64_t x, xx, xxx
+        float64_t prev_value
         int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
-        int64_t s, e
+        int64_t s, e, num_consecutive_same_value
         ndarray[float64_t] output, mean_array, values_copy
         bint is_monotonic_increasing_bounds
 
@@ -588,6 +607,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
             # never removed
             if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
 
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
                 compensation_xxx_add = compensation_xxx_remove = 0
                 compensation_xx_add = compensation_xx_remove = 0
                 compensation_x_add = compensation_x_remove = 0
@@ -596,7 +618,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(s, e):
                     val = values_copy[j]
                     add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add)
+                             &compensation_xx_add, &compensation_xxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
             else:
 
@@ -612,9 +635,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(end[i - 1], e):
                     val = values_copy[j]
                     add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add)
+                             &compensation_xx_add, &compensation_xxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_skew(minp, nobs, x, xx, xxx)
+            output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0
@@ -630,35 +654,44 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
 
 cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
                                 float64_t x, float64_t xx,
-                                float64_t xxx, float64_t xxxx) nogil:
+                                float64_t xxx, float64_t xxxx,
+                                int64_t num_consecutive_same_value,
+                                ) nogil:
     cdef:
         float64_t result, dnobs
         float64_t A, B, C, D, R, K
 
     if nobs >= minp:
-        dnobs = <float64_t>nobs
-        A = x / dnobs
-        R = A * A
-        B = xx / dnobs - R
-        R = R * A
-        C = xxx / dnobs - R - 3 * A * B
-        R = R * A
-        D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
-
-        # #18044: with uniform distribution, floating issue will
-        #         cause B != 0. and cause the result is a very
-        #         large number.
-        #
-        #         in core/nanops.py nanskew/nankurt call the function
-        #         _zero_out_fperr(m2) to fix floating error.
-        #         if the variance is less than 1e-14, it could be
-        #         treat as zero, here we follow the original
-        #         skew/kurt behaviour to check B <= 1e-14
-        if B <= 1e-14 or nobs < 4:
+        if nobs < 4:
             result = NaN
+        # GH 42064 46431
+        # uniform case, force result to be -3.
+        elif num_consecutive_same_value >= nobs:
+            result = -3.
         else:
-            K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
-            result = K / ((dnobs - 2.) * (dnobs - 3.))
+            dnobs = <float64_t>nobs
+            A = x / dnobs
+            R = A * A
+            B = xx / dnobs - R
+            R = R * A
+            C = xxx / dnobs - R - 3 * A * B
+            R = R * A
+            D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
+
+            # #18044: with uniform distribution, floating issue will
+            #         cause B != 0. and cause the result is a very
+            #         large number.
+            #
+            #         in core/nanops.py nanskew/nankurt call the function
+            #         _zero_out_fperr(m2) to fix floating error.
+            #         if the variance is less than 1e-14, it could be
+            #         treat as zero, here we follow the original
+            #         skew/kurt behaviour to check B <= 1e-14
+            if B <= 1e-14:
+                result = NaN
+            else:
+                K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
+                result = K / ((dnobs - 2.) * (dnobs - 3.))
     else:
         result = NaN
 
@@ -671,7 +704,10 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
                           float64_t *compensation_x,
                           float64_t *compensation_xx,
                           float64_t *compensation_xxx,
-                          float64_t *compensation_xxxx) nogil:
+                          float64_t *compensation_xxxx,
+                          int64_t *num_consecutive_same_value,
+                          float64_t *prev_value
+                          ) nogil:
     """ add a value from the kurotic calc """
     cdef:
         float64_t y, t
@@ -697,6 +733,14 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
         compensation_xxxx[0] = t - xxxx[0] - y
         xxxx[0] = t
 
+        # GH#42064, record num of same values to remove floating point artifacts
+        if val == prev_value[0]:
+            num_consecutive_same_value[0] += 1
+        else:
+            # reset to 1 (include current value itself)
+            num_consecutive_same_value[0] = 1
+        prev_value[0] = val
+
 
 cdef inline void remove_kurt(float64_t val, int64_t *nobs,
                              float64_t *x, float64_t *xx,
@@ -741,7 +785,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
         float64_t compensation_xx_remove, compensation_xx_add
         float64_t compensation_x_remove, compensation_x_add
         float64_t x, xx, xxx, xxxx
-        int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0
+        float64_t prev_value
+        int64_t nobs, s, e, num_consecutive_same_value
+        int64_t N = len(start), V = len(values), nobs_mean = 0
         ndarray[float64_t] output, values_copy
         bint is_monotonic_increasing_bounds
 
@@ -775,6 +821,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
             # never removed
             if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
 
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
                 compensation_xxxx_add = compensation_xxxx_remove = 0
                 compensation_xxx_remove = compensation_xxx_add = 0
                 compensation_xx_remove = compensation_xx_add = 0
@@ -784,7 +833,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(s, e):
                     add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                              &compensation_x_add, &compensation_xx_add,
-                             &compensation_xxx_add, &compensation_xxxx_add)
+                             &compensation_xxx_add, &compensation_xxxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
             else:
 
@@ -800,9 +850,10 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(end[i - 1], e):
                     add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                              &compensation_x_add, &compensation_xx_add,
-                             &compensation_xxx_add, &compensation_xxxx_add)
+                             &compensation_xxx_add, &compensation_xxxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
+            output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1860,3 +1860,14 @@ def test_rolling_mean_sum_floating_artifacts():
     assert (result[-3:] == 0).all()
     result = r.sum()
     assert (result[-3:] == 0).all()
+
+
+def test_rolling_skew_kurt_floating_artifacts():
+    # GH 42064 46431
+    # With a window of 4, the last two windows contain only zeros, so the
+    # results must be exactly 0 (skew) and -3 (kurt), not floating-point noise.
+
+    ser = Series([1 / 3, 4, 0, 0, 0, 0, 0])
+    roller = ser.rolling(4)
+    assert (roller.skew()[-2:] == 0).all()
+    assert (roller.kurt()[-2:] == -3).all()
diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py