Try using the grouper's cached `is_monotonic` value to keep `pct_change` vectorized

simonariddell · simonariddell · commit 849fac48dc38 · 2018-06-03T17:55:11.000-07:00
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -3935,10 +3935,38 @@ def _apply_to_column_groupbys(self, func):
         return func(self)
 
     def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):
-        """Calcuate pct_change of each value to previous entry in group"""
-        return self.apply(lambda x: x.pct_change(periods=periods,
-                                                 fill_method=fill_method,
-                                                 limit=limit, freq=freq))
+        """Calculate pct_change of each value to previous entry in group.
+
+        Takes a vectorized path when the grouper has already cached
+        ``is_monotonic`` as True and the shift is positional; otherwise
+        applies ``pct_change`` to each group separately.
+        """
+        grouper = self.grouper
+        cache = getattr(grouper, '_cache', None) or {}
+        # Only consult ``is_monotonic`` if it is already in the cache.
+        monotonic = 'is_monotonic' in cache and grouper.is_monotonic
+        # A ``freq`` shift is not positional, so the row masking below would
+        # not match it; such calls take the per-group path as well.
+        if not monotonic or fill_method is None or freq is not None:
+            return self.apply(lambda x: x.pct_change(periods=periods,
+                                                     fill_method=fill_method,
+                                                     limit=limit, freq=freq))
+
+        filled = getattr(self, fill_method)(limit=limit)
+        shifted = filled.shift(periods=periods)
+        pct_change = (filled / shifted) - 1
+
+        # The whole-object shift pairs the first (periods > 0) or last
+        # (periods < 0) ``abs(periods)`` rows of each group with rows outside
+        # that group, so blank them. Slicing clips to the group length, which
+        # keeps positions in bounds; ``periods == 0`` blanks nothing.
+        span = abs(periods)
+        invalid = []
+        for locs in map(np.sort, self.indices.values()):
+            invalid.extend(locs[:span] if periods > 0 else locs[::-1][:span])
+        if invalid:
+            pct_change.iloc[invalid] = np.nan
+        return pct_change
 
 
 class NDFrameGroupBy(GroupBy):
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
@@ -723,30 +723,37 @@ def interweave(list_obj):
 
 @pytest.mark.parametrize("test_series", [True, False])
 @pytest.mark.parametrize("shuffle", [True, False])
+@pytest.mark.parametrize("activate_cache", [True, False])
 @pytest.mark.parametrize("periods,fill_method,limit", [
     (1, 'ffill', None), (1, 'ffill', 1),
     (1, 'bfill', None), (1, 'bfill', 1),
     (-1, 'ffill', None), (-1, 'ffill', 1),
-    (-1, 'bfill', None), (-1, 'bfill', 1)])
-def test_pct_change(test_series, shuffle, periods, fill_method, limit):
-    vals = [3, np.nan, 1, 2, 4, 10, np.nan, np.nan]
+    (-1, 'bfill', None), (-1, 'bfill', 1),
+    (-1, None, None), (-1, None, 1),
+    (-1, None, None), (-1, None, 1)
+])
+def test_pct_change(test_series, shuffle, activate_cache, periods, fill_method, limit):
+    vals = [3, np.nan, 1, 2, 4, 10, np.nan, 9]
     keys = ['a', 'b']
-    key_v = [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j]
+    key_v = np.repeat(keys, len(vals))
     df = DataFrame({'key': key_v, 'vals': vals * 2})
     if shuffle:
         order = np.random.RandomState(seed=42).permutation(len(df))
         df = df.reindex(order).reset_index(drop=True)
 
     manual_apply = []
     for k in keys:
-        subgroup = Series(df.loc[df.key == k, 'vals'].values)
-        manual_apply.append(subgroup.pct_change(periods=periods,
-                                                fill_method=fill_method,
-                                                limit=limit))
-    exp_vals = pd.concat(manual_apply).reset_index(drop=True)
-    exp = pd.DataFrame(exp_vals, columns=['A'])
+        ind = df.loc[df.key == k, 'vals']
+        manual_apply.append(ind.pct_change(periods=periods,
+                                           fill_method=fill_method,
+                                           limit=limit))
+    exp_vals = pd.concat(manual_apply, ignore_index=True)
+    exp = pd.DataFrame(exp_vals.values, columns=['A'])
     grp = df.groupby('key')
 
+    if activate_cache:
+        grp.grouper.is_monotonic
+
     def get_result(grp_obj):
         return grp_obj.pct_change(periods=periods,
                                   fill_method=fill_method,
@@ -763,7 +770,7 @@ def get_result(grp_obj):
         tm.assert_series_equal(result, exp)
     else:
         result = get_result(grp)
-        result.reset_index(drop=True, inplace=True)
+        result = result.reset_index(drop=True)
         result.columns = ['A']
         tm.assert_frame_equal(result, exp)