Merge pull request #6955 from TomAugspurger/quantiles

Tom Augspurger · Tom Augspurger · commit c6730eaca478 · 2014-04-25T06:55:05.000-05:00
ENH: Quantiles accepts an array
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -482,6 +482,8 @@ Enhancements
 - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max,
   :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`)
 - ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
+- :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of
+  quantiles (:issue:`6955`).
 
 Performance
 ~~~~~~~~~~~
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4145,22 +4145,41 @@ def mode(self, axis=0, numeric_only=False):
     def quantile(self, q=0.5, axis=0, numeric_only=True):
         """
         Return values at the given quantile over requested axis, a la
-        scoreatpercentile in scipy.stats
+        numpy.percentile.
 
         Parameters
         ----------
-        q : quantile, default 0.5 (50% quantile)
-            0 <= q <= 1
+        q : float or array-like, default 0.5 (50% quantile)
+            0 <= q <= 1, the quantile(s) to compute
         axis : {0, 1}
             0 for row-wise, 1 for column-wise
 
         Returns
         -------
-        quantiles : Series
+        quantiles : Series or DataFrame
+            If ``q`` is an array, a DataFrame will be returned where the
+            index is ``q``, the columns are the columns of self, and the
+            values are the quantiles.
+            If ``q`` is a float, a Series will be returned where the
+            index is the columns of self and the values are the quantiles.
+
+        Examples
+        --------
+
+        >>> df = DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
+        ...                columns=['a', 'b'])
+        >>> df.quantile(.1)
+        a    1.3
+        b    3.7
+        dtype: float64
+        >>> df.quantile([.1, .5])
+               a     b
+        0.1  1.3   3.7
+        0.5  2.5  55.0
         """
-        per = q * 100
+        per = np.asarray(q) * 100
 
-        def f(arr):
+        def f(arr, per):
             arr = arr.values
             if arr.dtype != np.float_:
                 arr = arr.astype(float)
@@ -4171,7 +4190,12 @@ def f(arr):
                 return _quantile(arr, per)
 
         data = self._get_numeric_data() if numeric_only else self
-        return data.apply(f, axis=axis)
+        if com.is_list_like(per):
+            from pandas.tools.merge import concat
+            return concat([data.apply(f, axis=axis, args=(x,)) for x in per],
+                          axis=1, keys=per/100.).T
+        else:
+            return data.apply(f, axis=axis, args=(per,))
 
     def rank(self, axis=0, numeric_only=None, method='average',
              na_option='keep', ascending=True, pct=False):
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1220,26 +1220,51 @@ def round(self, decimals=0, out=None):
 
     def quantile(self, q=0.5):
         """
-        Return value at the given quantile, a la scoreatpercentile in
-        scipy.stats
+        Return value at the given quantile, a la numpy.percentile.
 
         Parameters
         ----------
-        q : quantile
-            0 <= q <= 1
+        q : float or array-like, default 0.5 (50% quantile)
+            0 <= q <= 1, the quantile(s) to compute
 
         Returns
         -------
-        quantile : float
+        quantile : float or Series
+            If ``q`` is an array, a Series will be returned where the
+            index is ``q`` and the values are the quantiles.
+            Otherwise a scalar; NA is returned if there are no valid values.
+
+        Examples
+        --------
+        >>> s = Series([1, 2, 3, 4])
+        >>> s.quantile(.5)
+        2.5
+        >>> s.quantile([.25, .5, .75])
+        0.25    1.75
+        0.50    2.50
+        0.75    3.25
+        dtype: float64
         """
         valid_values = self.dropna().values
         if len(valid_values) == 0:
             return pa.NA
+
+        def multi(values, qs):
+            if com.is_list_like(qs):
+                return Series([_quantile(values, x*100)
+                               for x in qs], index=qs)
+            else:
+                return _quantile(values, qs*100)
+
         if com.is_datetime64_dtype(self):
             values = _values_from_object(self).view('i8')
-            result = lib.Timestamp(_quantile(values, q * 100))
+            result = multi(values, q)
+            if com.is_list_like(q):
+                result = result.map(lib.Timestamp)
+            else:
+                result = lib.Timestamp(result)
         else:
-            result = _quantile(valid_values, q * 100)
+            result = multi(valid_values, q)
 
         return result
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -10945,6 +10945,25 @@ def test_quantile(self):
         xp = df.median()
         assert_series_equal(rs, xp)
 
+    def test_quantile_multi(self):
+        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+                       columns=['a', 'b', 'c'])
+        result = df.quantile([.25, .5])
+        expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
+                             index=[.25, .5], columns=['a', 'b', 'c'])
+        assert_frame_equal(result, expected)
+
+        # axis=1: each row is constant, so every quantile is that row's value
+        result = df.quantile([.25, .5], axis=1)
+        expected = DataFrame([[1., 2., 3.]] * 2, index=[.25, .5])
+        assert_frame_equal(result, expected)
+
+        # empty
+        result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
+        expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
+                             index=[.1, .9])
+        assert_frame_equal(result, expected)
+
     def test_cumsum(self):
         self.tsframe.ix[5:10, 0] = nan
         self.tsframe.ix[10:15, 1] = nan
@@ -12728,7 +12747,6 @@ def check_query_with_unnamed_multiindex(self, parser, engine):
         df = DataFrame(randn(10, 2), index=index)
         ind = Series(df.index.get_level_values(0).values, index=index)
 
-        #import ipdb; ipdb.set_trace()
         res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
         res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
         exp = df[ind == 'red']
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -2203,6 +2203,22 @@ def test_quantile(self):
             q = tds.quantile(.25)
             self.assertEqual(q, pd.to_timedelta('24:00:00'))
 
+    def test_quantile_multi(self):
+        from numpy import percentile
+        # list-like q -> Series indexed by q; values checked against numpy
+        qs = [.1, .9]
+        result = self.ts.quantile(qs)
+        expected = pd.Series([percentile(self.ts.valid(), 10),
+                              percentile(self.ts.valid(), 90)],
+                             index=qs)
+        assert_series_equal(result, expected)
+
+        dts = self.ts.index.to_series()
+        result = dts.quantile((.2, .2))
+        assert_series_equal(result, Series([Timestamp('2000-01-10 19:12:00'),
+                                            Timestamp('2000-01-10 19:12:00')],
+                                           index=[.2, .2]))
+
     def test_describe(self):
         _ = self.series.describe()
         _ = self.ts.describe()