Central moments (#23)

LukeMathWalker · web-flow · commit 9e04bc0989e6 · 2019-04-06T18:30:44.000+01:00
* Added definition of central order moment

* Added stubbed implementation

* Add behaviour to return None if the array is empty

* Handle edge cases

* Added test

* First implementation completed

* Added test for second order moment (variance)

* Implemented test and renamed to central_moment

* Using Horner's method for evaluating polynomials

* Add algorithm notes to the docs

* Added algorithmic complexity to the docs

* Added two optimizations

* Added signature for bulk central moment computation

* Fixed trait signature

* Indent

* Factored out logic from central_moment

* Implemented central_moments

* Test implementation correctness

* Added skewness and kurtosis

* Added docs

* Added tests for kurtosis and skewness

* Fixed assertions

* No need to get to order 4 for skewness

* Fixed kurtosis test

* Enriched docs

* Fmt

* Avoid computing mean for p=0,1 central moments

* Replace map + clone with mapv

* Check for moment order overflowing `i32`

In reality, this probably isn't necessary because I'd expect someone
to terminate their program when trying to calculate a moment of order
greater than `i32::MAX` (because it would take so long), but it's nice
to have an explicit check anyway.

* Take advantage of IterBinomial from num-integer

* Remove unnecessary explicit clones

`A: Float` requires `A: Copy`.

* Replace .map() with ?

I think this is a bit easier to understand at first glance.

* Test more cases in central moment tests

* Rename central moment tests

* Push diff to the lowest possible exponent

* Return a Result instead of Option, for consistency

* Match impl with trait definition

* Match test to trait+impl

* Fmt

* Converted order to u16

* Fmt

* Fix typo. Change casting to use as.

* Fix typo. Change casting to use as.

* Fix typo. Change casting to use as.
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,6 +17,7 @@ categories = ["data-structures", "science"]
 [dependencies]
 ndarray = "0.12.1"
 noisy_float = "0.1.8"
+num-integer = "0.1"
 num-traits = "0.2"
 rand = "0.6"
 itertools = { version = "0.7.0", default-features = false }
diff --git a/src/lib.rs b/src/lib.rs
@@ -28,6 +28,7 @@
 extern crate itertools;
 extern crate ndarray;
 extern crate noisy_float;
+extern crate num_integer;
 extern crate num_traits;
 extern crate rand;
 
diff --git a/src/summary_statistics/means.rs b/src/summary_statistics/means.rs
@@ -1,6 +1,7 @@
 use super::SummaryStatisticsExt;
 use crate::errors::EmptyInput;
 use ndarray::{ArrayBase, Data, Dimension};
+use num_integer::IterBinomial;
 use num_traits::{Float, FromPrimitive, Zero};
 use std::ops::{Add, Div};
 
@@ -36,15 +37,161 @@ where
     {
         self.map(|x| x.ln()).mean().map(|x| x.exp())
     }
+
+    fn kurtosis(&self) -> Result<A, EmptyInput>
+    where
+        A: Float + FromPrimitive,
+    {
+        let central_moments = self.central_moments(4)?;
+        Ok(central_moments[4] / central_moments[2].powi(2))
+    }
+
+    fn skewness(&self) -> Result<A, EmptyInput>
+    where
+        A: Float + FromPrimitive,
+    {
+        let central_moments = self.central_moments(3)?;
+        Ok(central_moments[3] / central_moments[2].sqrt().powi(3))
+    }
+
+    fn central_moment(&self, order: u16) -> Result<A, EmptyInput>
+    where
+        A: Float + FromPrimitive,
+    {
+        if self.is_empty() {
+            return Err(EmptyInput);
+        }
+        match order {
+            0 => Ok(A::one()),
+            1 => Ok(A::zero()),
+            n => {
+                let mean = self.mean().unwrap();
+                let shifted_array = self.mapv(|x| x - mean);
+                let shifted_moments = moments(shifted_array, n);
+                let correction_term = -shifted_moments[1];
+
+                let coefficients = central_moment_coefficients(&shifted_moments);
+                Ok(horner_method(coefficients, correction_term))
+            }
+        }
+    }
+
+    fn central_moments(&self, order: u16) -> Result<Vec<A>, EmptyInput>
+    where
+        A: Float + FromPrimitive,
+    {
+        if self.is_empty() {
+            return Err(EmptyInput);
+        }
+        match order {
+            0 => Ok(vec![A::one()]),
+            1 => Ok(vec![A::one(), A::zero()]),
+            n => {
+                // We only perform these operations once, and then reuse their
+                // result to compute all the required moments
+                let mean = self.mean().unwrap();
+                let shifted_array = self.mapv(|x| x - mean);
+                let shifted_moments = moments(shifted_array, n);
+                let correction_term = -shifted_moments[1];
+
+                let mut central_moments = vec![A::one(), A::zero()];
+                for k in 2..=n {
+                    let coefficients =
+                        central_moment_coefficients(&shifted_moments[..=(k as usize)]);
+                    let central_moment = horner_method(coefficients, correction_term);
+                    central_moments.push(central_moment)
+                }
+                Ok(central_moments)
+            }
+        }
+    }
+}
+
+/// Returns a vector containing all moments of the array elements up to
+/// *order*, where the *p*-th moment is defined as:
+///
+/// ```text
+/// 1  n
+/// ―  ∑ xᵢᵖ
+/// n i=1
+/// ```
+///
+/// The returned moments are ordered by power magnitude: 0th moment, 1st moment, etc.
+///
+/// **Panics** if `A::from_usize()` fails to convert the number of elements in the array.
+fn moments<A, S, D>(a: ArrayBase<S, D>, order: u16) -> Vec<A>
+where
+    A: Float + FromPrimitive,
+    S: Data<Elem = A>,
+    D: Dimension,
+{
+    let n_elements =
+        A::from_usize(a.len()).expect("Converting number of elements to `A` must not fail");
+    let order = order as i32;
+
+    // When k=0, we are raising each element to the 0th power
+    // No need to waste CPU cycles going through the array
+    let mut moments = vec![A::one()];
+
+    if order >= 1 {
+        // When k=1, we don't need to raise elements to the 1th power (identity)
+        moments.push(a.sum() / n_elements)
+    }
+
+    for k in 2..=order {
+        moments.push(a.map(|x| x.powi(k)).sum() / n_elements)
+    }
+    moments
+}
+
+/// Returns the coefficients in the polynomial expression to compute the *p*th
+/// central moment as a function of the sample mean.
+///
+/// It takes as input all moments up to order *p*, ordered by power magnitude - *p* is
+/// inferred to be the length of the *moments* array.
+fn central_moment_coefficients<A>(moments: &[A]) -> Vec<A>
+where
+    A: Float + FromPrimitive,
+{
+    let order = moments.len();
+    IterBinomial::new(order)
+        .zip(moments.iter().rev())
+        .map(|(binom, &moment)| A::from_usize(binom).unwrap() * moment)
+        .collect()
+}
+
+/// Uses [Horner's method] to evaluate a polynomial with a single indeterminate.
+///
+/// Coefficients are expected to be sorted by ascending order
+/// with respect to the indeterminate's exponent.
+///
+/// If the array is empty, `A::zero()` is returned.
+///
+/// Horner's method can evaluate a polynomial of order *n* with a single indeterminate
+/// using only *n-1* multiplications and *n-1* sums - in terms of number of operations,
+/// this is an optimal algorithm for polynomial evaluation.
+///
+/// [Horner's method]: https://en.wikipedia.org/wiki/Horner%27s_method
+fn horner_method<A>(coefficients: Vec<A>, indeterminate: A) -> A
+where
+    A: Float,
+{
+    let mut result = A::zero();
+    for coefficient in coefficients.into_iter().rev() {
+        result = coefficient + indeterminate * result
+    }
+    result
 }
 
 #[cfg(test)]
 mod tests {
     use super::SummaryStatisticsExt;
     use crate::errors::EmptyInput;
-    use approx::abs_diff_eq;
-    use ndarray::{array, Array1};
+    use approx::assert_abs_diff_eq;
+    use ndarray::{array, Array, Array1};
+    use ndarray_rand::RandomExt;
     use noisy_float::types::N64;
+    use rand::distributions::Uniform;
     use std::f64;
 
     #[test]
@@ -90,16 +237,116 @@ mod tests {
         // Computed using SciPy
         let expected_geometric_mean = 0.4345897639796527;
 
-        abs_diff_eq!(a.mean().unwrap(), expected_mean, epsilon = f64::EPSILON);
-        abs_diff_eq!(
+        assert_abs_diff_eq!(a.mean().unwrap(), expected_mean, epsilon = 1e-9);
+        assert_abs_diff_eq!(
             a.harmonic_mean().unwrap(),
             expected_harmonic_mean,
-            epsilon = f64::EPSILON
+            epsilon = 1e-7
         );
-        abs_diff_eq!(
+        assert_abs_diff_eq!(
             a.geometric_mean().unwrap(),
             expected_geometric_mean,
-            epsilon = f64::EPSILON
+            epsilon = 1e-12
         );
     }
+
+    #[test]
+    fn test_central_moment_with_empty_array_of_floats() {
+        let a: Array1<f64> = array![];
+        for order in 0..=3 {
+            assert_eq!(a.central_moment(order), Err(EmptyInput));
+            assert_eq!(a.central_moments(order), Err(EmptyInput));
+        }
+    }
+
+    #[test]
+    fn test_zeroth_central_moment_is_one() {
+        let n = 50;
+        let bound: f64 = 200.;
+        let a = Array::random(n, Uniform::new(-bound.abs(), bound.abs()));
+        assert_eq!(a.central_moment(0).unwrap(), 1.);
+    }
+
+    #[test]
+    fn test_first_central_moment_is_zero() {
+        let n = 50;
+        let bound: f64 = 200.;
+        let a = Array::random(n, Uniform::new(-bound.abs(), bound.abs()));
+        assert_eq!(a.central_moment(1).unwrap(), 0.);
+    }
+
+    #[test]
+    fn test_central_moments() {
+        let a: Array1<f64> = array![
+            0.07820559, 0.5026185, 0.80935324, 0.39384033, 0.9483038, 0.62516215, 0.90772261,
+            0.87329831, 0.60267392, 0.2960298, 0.02810356, 0.31911966, 0.86705506, 0.96884832,
+            0.2222465, 0.42162446, 0.99909868, 0.47619762, 0.91696979, 0.9972741, 0.09891734,
+            0.76934818, 0.77566862, 0.7692585, 0.2235759, 0.44821286, 0.79732186, 0.04804275,
+            0.87863238, 0.1111003, 0.6653943, 0.44386445, 0.2133176, 0.39397086, 0.4374617,
+            0.95896624, 0.57850146, 0.29301706, 0.02329879, 0.2123203, 0.62005503, 0.996492,
+            0.5342986, 0.97822099, 0.5028445, 0.6693834, 0.14256682, 0.52724704, 0.73482372,
+            0.1809703,
+        ];
+        // Computed using scipy.stats.moment
+        let expected_moments = vec![
+            1.,
+            0.,
+            0.09339920262960291,
+            -0.0026849636727735186,
+            0.015403769257729755,
+            -0.001204176487006564,
+            0.002976822584939186,
+        ];
+        for (order, expected_moment) in expected_moments.iter().enumerate() {
+            assert_abs_diff_eq!(
+                a.central_moment(order as u16).unwrap(),
+                expected_moment,
+                epsilon = 1e-8
+            );
+        }
+    }
+
+    #[test]
+    fn test_bulk_central_moments() {
+        // Test that the bulk method is coherent with the non-bulk method
+        let n = 50;
+        let bound: f64 = 200.;
+        let a = Array::random(n, Uniform::new(-bound.abs(), bound.abs()));
+        let order = 10;
+        let central_moments = a.central_moments(order).unwrap();
+        for i in 0..=order {
+            assert_eq!(a.central_moment(i).unwrap(), central_moments[i as usize]);
+        }
+    }
+
+    #[test]
+    fn test_kurtosis_and_skewness_is_none_with_empty_array_of_floats() {
+        let a: Array1<f64> = array![];
+        assert_eq!(a.skewness(), Err(EmptyInput));
+        assert_eq!(a.kurtosis(), Err(EmptyInput));
+    }
+
+    #[test]
+    fn test_kurtosis_and_skewness() {
+        let a: Array1<f64> = array![
+            0.33310096, 0.98757449, 0.9789796, 0.96738114, 0.43545674, 0.06746873, 0.23706562,
+            0.04241815, 0.38961714, 0.52421271, 0.93430327, 0.33911604, 0.05112372, 0.5013455,
+            0.05291507, 0.62511183, 0.20749633, 0.22132433, 0.14734804, 0.51960608, 0.00449208,
+            0.4093339, 0.2237519, 0.28070469, 0.7887231, 0.92224523, 0.43454188, 0.18335111,
+            0.08646856, 0.87979847, 0.25483457, 0.99975627, 0.52712442, 0.41163279, 0.85162594,
+            0.52618733, 0.75815023, 0.30640695, 0.14205781, 0.59695813, 0.851331, 0.39524328,
+            0.73965373, 0.4007615, 0.02133069, 0.92899207, 0.79878191, 0.38947334, 0.22042183,
+            0.77768353,
+        ];
+        // Computed using scipy.stats.kurtosis(a, fisher=False)
+        let expected_kurtosis = 1.821933711687523;
+        // Computed using scipy.stats.skew
+        let expected_skewness = 0.2604785422878771;
+
+        let kurtosis = a.kurtosis().unwrap();
+        let skewness = a.skewness().unwrap();
+
+        assert_abs_diff_eq!(kurtosis, expected_kurtosis, epsilon = 1e-12);
+        assert_abs_diff_eq!(skewness, expected_skewness, epsilon = 1e-8);
+    }
 }
diff --git a/src/summary_statistics/mod.rs b/src/summary_statistics/mod.rs
@@ -61,6 +61,85 @@ where
     fn geometric_mean(&self) -> Result<A, EmptyInput>
     where
         A: Float + FromPrimitive;
+
+    /// Returns the [kurtosis] `Kurt[X]` of all elements in the array:
+    ///
+    /// ```text
+    /// Kurt[X] = μ₄ / σ⁴
+    /// ```
+    ///
+    /// where μ₄ is the fourth central moment and σ is the standard deviation of
+    /// the elements in the array.
+    ///
+    /// This is sometimes referred to as _Pearson's kurtosis_. Fisher's kurtosis can be
+    /// computed by subtracting 3 from Pearson's kurtosis.
+    ///
+    /// If the array is empty, `Err(EmptyInput)` is returned.
+    ///
+    /// **Panics** if `A::from_usize()` fails to convert the number of elements in the array.
+    ///
+    /// [kurtosis]: https://en.wikipedia.org/wiki/Kurtosis
+    fn kurtosis(&self) -> Result<A, EmptyInput>
+    where
+        A: Float + FromPrimitive;
+
+    /// Returns the [Pearson's moment coefficient of skewness] γ₁ of all elements in the array:
+    ///
+    /// ```text
+    /// γ₁ = μ₃ / σ³
+    /// ```
+    ///
+    /// where μ₃ is the third central moment and σ is the standard deviation of
+    /// the elements in the array.
+    ///
+    /// If the array is empty, `Err(EmptyInput)` is returned.
+    ///
+    /// **Panics** if `A::from_usize()` fails to convert the number of elements in the array.
+    ///
+    /// [Pearson's moment coefficient of skewness]: https://en.wikipedia.org/wiki/Skewness
+    fn skewness(&self) -> Result<A, EmptyInput>
+    where
+        A: Float + FromPrimitive;
+
+    /// Returns the *p*-th [central moment] of all elements in the array, μₚ:
+    ///
+    /// ```text
+    ///      1  n
+    /// μₚ = ―  ∑ (xᵢ-x̅)ᵖ
+    ///      n i=1
+    /// ```
+    ///
+    /// If the array is empty, `Err(EmptyInput)` is returned.
+    ///
+    /// The *p*-th central moment is computed using a corrected two-pass algorithm (see Section 3.5
+    /// in [Pébay et al., 2016]). Complexity is *O(np)* when *n >> p*, *p > 1*.
+    ///
+    /// **Panics** if `A::from_usize()` fails to convert the number of elements
+    /// in the array or if `order` overflows `i32`.
+    ///
+    /// [central moment]: https://en.wikipedia.org/wiki/Central_moment
+    /// [Pébay et al., 2016]: https://www.osti.gov/pages/servlets/purl/1427275
+    fn central_moment(&self, order: u16) -> Result<A, EmptyInput>
+    where
+        A: Float + FromPrimitive;
+
+    /// Returns the first *p* [central moments] of all elements in the array, see [central moment]
+    /// for more details.
+    ///
+    /// If the array is empty, `Err(EmptyInput)` is returned.
+    ///
+    /// This method reuses the intermediate steps for the *k*-th moment to compute the *(k+1)*-th,
+    /// being thus more efficient than repeated calls to [central moment] if the computation
+    /// of central moments of multiple orders is required.
+    ///
+    /// **Panics** if `A::from_usize()` fails to convert the number of elements
+    /// in the array or if `order` overflows `i32`.
+    ///
+    /// [central moments]: https://en.wikipedia.org/wiki/Central_moment
+    /// [central moment]: #tymethod.central_moment
+    fn central_moments(&self, order: u16) -> Result<Vec<A>, EmptyInput>
+    where
+        A: Float + FromPrimitive;
 }
 
 mod means;