From 2cddc79b1e6b30cf57cbb198570b27009c2a8289 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 1 Mar 2022 12:38:57 +0100 Subject: [PATCH 01/13] Set `expand=True` when calling `change_size` in `SymbolicDistribution` --- pymc/distributions/distribution.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pymc/distributions/distribution.py b/pymc/distributions/distribution.py index 31a9cfb9a8..daa649fd2e 100644 --- a/pymc/distributions/distribution.py +++ b/pymc/distributions/distribution.py @@ -500,6 +500,7 @@ def __new__( rv_out = cls.change_size( rv=rv_out, new_size=resize_shape, + expand=True, ) rv_out = model.register_rv( From 6de9b20dbb6ca223aecac2ce383fc5dd3eff471a Mon Sep 17 00:00:00 2001 From: Ricardo Date: Fri, 4 Feb 2022 09:40:23 +0100 Subject: [PATCH 02/13] Move NormalMixture tests to their own class --- pymc/tests/test_mixture.py | 190 +++++++++++++++++++------------------ 1 file changed, 98 insertions(+), 92 deletions(-) diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index bd231959d4..d6adfb9c6b 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -112,98 +112,6 @@ def test_mixture_list_of_normals(self): np.sort(trace["mu"].mean(axis=0)), np.sort(self.norm_mu), rtol=0.1, atol=0.1 ) - def test_normal_mixture(self): - with Model() as model: - w = Dirichlet("w", floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) - mu = Normal("mu", 0.0, 10.0, shape=self.norm_w.size) - tau = Gamma("tau", 1.0, 1.0, shape=self.norm_w.size) - NormalMixture("x_obs", w, mu, tau=tau, observed=self.norm_x) - step = Metropolis() - trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) - - assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.norm_w), rtol=0.1, atol=0.1) - assert_allclose( - np.sort(trace["mu"].mean(axis=0)), np.sort(self.norm_mu), rtol=0.1, atol=0.1 - ) - - @pytest.mark.parametrize( - "nd,ncomp", [(tuple(), 5), (1, 5), (3, 5), ((3, 3), 5), (3, 3), ((3, 3), 3)], ids=str - ) - def test_normal_mixture_nd(self, nd, ncomp): - nd = to_tuple(nd) - ncomp = int(ncomp) - comp_shape = nd + (ncomp,) - test_mus = np.random.randn(*comp_shape) - test_taus = np.random.gamma(1, 1, size=comp_shape) - observed = generate_normal_mixture_data( - w=np.ones(ncomp) / ncomp, mu=test_mus, sd=1 / np.sqrt(test_taus), size=10 - ) - - with Model() as model0: - mus = Normal("mus", shape=comp_shape) - taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) - ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) - mixture0 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape) - obs0 = NormalMixture( - "obs", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape, observed=observed - ) - - with Model() as model1: - mus = Normal("mus", shape=comp_shape) - taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) - ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) - comp_dist = [ - Normal.dist(mu=mus[..., i], tau=taus[..., i], shape=nd) for i in range(ncomp) - ] - mixture1 = Mixture("m", w=ws, comp_dists=comp_dist, shape=nd) - obs1 = Mixture("obs", w=ws, comp_dists=comp_dist, shape=nd, observed=observed) - - with Model() as model2: - # Expected to fail if comp_shape is not provided, - # nd is multidim and it does not broadcast with ncomp. If by chance - # it does broadcast, an error is raised if the mixture is given - # observed data. - # Furthermore, the Mixture will also raise errors when the observed - # data is multidimensional but it does not broadcast well with - # comp_dists. 
- mus = Normal("mus", shape=comp_shape) - taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) - ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) - if len(nd) > 1: - if nd[-1] != ncomp: - with pytest.raises(ValueError): - NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) - mixture2 = None - else: - mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) - else: - mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) - observed_fails = False - if len(nd) >= 1 and nd != (1,): - try: - np.broadcast(np.empty(comp_shape), observed) - except Exception: - observed_fails = True - if observed_fails: - with pytest.raises(ValueError): - NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed) - obs2 = None - else: - obs2 = NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed) - - testpoint = model0.compute_initial_point() - testpoint["mus"] = test_mus - testpoint["taus"] = test_taus - assert_allclose(model0.logp(testpoint), model1.logp(testpoint)) - assert_allclose(mixture0.logp(testpoint), mixture1.logp(testpoint)) - assert_allclose(obs0.logp(testpoint), obs1.logp(testpoint)) - if mixture2 is not None and obs2 is not None: - assert_allclose(model0.logp(testpoint), model2.logp(testpoint)) - if mixture2 is not None: - assert_allclose(mixture0.logp(testpoint), mixture2.logp(testpoint)) - if obs2 is not None: - assert_allclose(obs0.logp(testpoint), obs2.logp(testpoint)) - def test_poisson_mixture(self): with Model() as model: w = Dirichlet("w", floatX(np.ones_like(self.pois_w)), shape=self.pois_w.shape) @@ -388,6 +296,104 @@ def build_toy_dataset(N, K): assert prior["chol_cov_0"].shape == (n_samples, D * (D + 1) // 2) +class TestNormalMixture(SeededTest): + @classmethod + def setup_class(cls): + TestMixture.setup_class() + + def test_normal_mixture(self): + with Model() as model: + w = Dirichlet("w", floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) + mu = Normal("mu", 0.0, 10.0, shape=self.norm_w.size) + tau = Gamma("tau", 1.0, 1.0, shape=self.norm_w.size) + NormalMixture("x_obs", w, mu, tau=tau, observed=self.norm_x) + step = Metropolis() + trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) + + assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.norm_w), rtol=0.1, atol=0.1) + assert_allclose( + np.sort(trace["mu"].mean(axis=0)), np.sort(self.norm_mu), rtol=0.1, atol=0.1 + ) + + @pytest.mark.parametrize( + "nd,ncomp", [(tuple(), 5), (1, 5), (3, 5), ((3, 3), 5), (3, 3), ((3, 3), 3)], ids=str + ) + def test_normal_mixture_nd(self, nd, ncomp): + nd = to_tuple(nd) + ncomp = int(ncomp) + comp_shape = nd + (ncomp,) + test_mus = np.random.randn(*comp_shape) + test_taus = np.random.gamma(1, 1, size=comp_shape) + observed = generate_normal_mixture_data( + w=np.ones(ncomp) / ncomp, mu=test_mus, sd=1 / np.sqrt(test_taus), size=10 + ) + + with Model() as model0: + mus = Normal("mus", shape=comp_shape) + taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) + ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) + mixture0 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape) + obs0 = NormalMixture( + "obs", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape, observed=observed + ) + + with Model() as model1: + mus = Normal("mus", shape=comp_shape) + taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) + ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) + comp_dist = [ + Normal.dist(mu=mus[..., i], tau=taus[..., i], shape=nd) for i in range(ncomp) + ] + mixture1 
= Mixture("m", w=ws, comp_dists=comp_dist, shape=nd) + obs1 = Mixture("obs", w=ws, comp_dists=comp_dist, shape=nd, observed=observed) + + with Model() as model2: + # Expected to fail if comp_shape is not provided, + # nd is multidim and it does not broadcast with ncomp. If by chance + # it does broadcast, an error is raised if the mixture is given + # observed data. + # Furthermore, the Mixture will also raise errors when the observed + # data is multidimensional but it does not broadcast well with + # comp_dists. + mus = Normal("mus", shape=comp_shape) + taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) + ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) + if len(nd) > 1: + if nd[-1] != ncomp: + with pytest.raises(ValueError): + NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) + mixture2 = None + else: + mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) + else: + mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) + observed_fails = False + if len(nd) >= 1 and nd != (1,): + try: + np.broadcast(np.empty(comp_shape), observed) + except Exception: + observed_fails = True + if observed_fails: + with pytest.raises(ValueError): + NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed) + obs2 = None + else: + obs2 = NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed) + + testpoint = model0.compute_initial_point() + testpoint["mus"] = test_mus + testpoint["taus"] = test_taus + assert_allclose(model0.logp(testpoint), model1.logp(testpoint)) + assert_allclose(mixture0.logp(testpoint), mixture1.logp(testpoint)) + assert_allclose(obs0.logp(testpoint), obs1.logp(testpoint)) + if mixture2 is not None and obs2 is not None: + assert_allclose(model0.logp(testpoint), model2.logp(testpoint)) + if mixture2 is not None: + assert_allclose(mixture0.logp(testpoint), mixture2.logp(testpoint)) + if obs2 is not None: + assert_allclose(obs0.logp(testpoint), obs2.logp(testpoint)) + + class TestMixtureVsLatent(SeededTest): def setup_method(self, *args, **kwargs): super().setup_method(*args, **kwargs) From 0291fcb01335f3a6f835cdb0205ae6c223eb97d9 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Fri, 4 Feb 2022 10:08:59 +0100 Subject: [PATCH 03/13] Move mixture random tests from test_distributions_random to test_mixture --- pymc/tests/test_distributions_random.py | 102 ------------------------ pymc/tests/test_mixture.py | 67 ++++++++++++++++ 2 files changed, 67 insertions(+), 102 deletions(-) diff --git a/pymc/tests/test_distributions_random.py b/pymc/tests/test_distributions_random.py index c400d8e57b..3dbba9c5a4 100644 --- a/pymc/tests/test_distributions_random.py +++ b/pymc/tests/test_distributions_random.py @@ -19,7 +19,6 @@ import aesara import numpy as np -import numpy.random as nr import numpy.testing as npt import pytest import scipy.stats as st @@ -59,7 +58,6 @@ def random_polyagamma(*args, **kwargs): R, RandomPdMatrix, Rplus, - Simplex, build_model, product, ) @@ -2045,106 +2043,6 @@ def check_draws_match_expected(self): assert np.all(np.abs(x.eval() - np.array([0.5, 0, 2.0])) < 0.01) -class TestScalarParameterSamples(SeededTest): - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") - def test_normalmixture(self): - def ref_rand(size, w, mu, sigma): - component = np.random.choice(w.size, size=size, p=w) - return np.random.normal(mu[component], sigma[component], size=size) - - pymc_random( - pm.NormalMixture, - { - "w": Simplex(2), - "mu": Domain([[0.05, 2.5], [-5.0, 1.0]], edges=(None, None)), - "sigma": 
Domain([[1, 1], [1.5, 2.0]], edges=(None, None)), - }, - extra_args={"comp_shape": 2}, - size=1000, - ref_rand=ref_rand, - ) - pymc_random( - pm.NormalMixture, - { - "w": Simplex(3), - "mu": Domain([[-5.0, 1.0, 2.5]], edges=(None, None)), - "sigma": Domain([[1.5, 2.0, 3.0]], edges=(None, None)), - }, - extra_args={"comp_shape": 3}, - size=1000, - ref_rand=ref_rand, - ) - - -@pytest.mark.xfail(reason="This distribution has not been refactored for v4") -def test_mixture_random_shape(): - # test the shape broadcasting in mixture random - y = np.concatenate([nr.poisson(5, size=10), nr.poisson(9, size=10)]) - with pm.Model() as m: - comp0 = pm.Poisson.dist(mu=np.ones(2)) - w0 = pm.Dirichlet("w0", a=np.ones(2), shape=(2,)) - like0 = pm.Mixture("like0", w=w0, comp_dists=comp0, observed=y) - - comp1 = pm.Poisson.dist(mu=np.ones((20, 2)), shape=(20, 2)) - w1 = pm.Dirichlet("w1", a=np.ones(2), shape=(2,)) - like1 = pm.Mixture("like1", w=w1, comp_dists=comp1, observed=y) - - comp2 = pm.Poisson.dist(mu=np.ones(2)) - w2 = pm.Dirichlet("w2", a=np.ones(2), shape=(20, 2)) - like2 = pm.Mixture("like2", w=w2, comp_dists=comp2, observed=y) - - comp3 = pm.Poisson.dist(mu=np.ones(2), shape=(20, 2)) - w3 = pm.Dirichlet("w3", a=np.ones(2), shape=(20, 2)) - like3 = pm.Mixture("like3", w=w3, comp_dists=comp3, observed=y) - - # XXX: This needs to be refactored - rand0, rand1, rand2, rand3 = [None] * 4 # draw_values( - # [like0, like1, like2, like3], point=m.initial_point, size=100 - # ) - assert rand0.shape == (100, 20) - assert rand1.shape == (100, 20) - assert rand2.shape == (100, 20) - assert rand3.shape == (100, 20) - - with m: - ppc = pm.sample_posterior_predictive([m.compute_initial_point()], samples=200) - assert ppc["like0"].shape == (200, 20) - assert ppc["like1"].shape == (200, 20) - assert ppc["like2"].shape == (200, 20) - assert ppc["like3"].shape == (200, 20) - - -@pytest.mark.xfail(reason="This distribution has not been refactored for v4") -def test_mixture_random_shape_fast(): - # test the shape broadcasting in mixture random - y = np.concatenate([nr.poisson(5, size=10), nr.poisson(9, size=10)]) - with pm.Model() as m: - comp0 = pm.Poisson.dist(mu=np.ones(2)) - w0 = pm.Dirichlet("w0", a=np.ones(2), shape=(2,)) - like0 = pm.Mixture("like0", w=w0, comp_dists=comp0, observed=y) - - comp1 = pm.Poisson.dist(mu=np.ones((20, 2)), shape=(20, 2)) - w1 = pm.Dirichlet("w1", a=np.ones(2), shape=(2,)) - like1 = pm.Mixture("like1", w=w1, comp_dists=comp1, observed=y) - - comp2 = pm.Poisson.dist(mu=np.ones(2)) - w2 = pm.Dirichlet("w2", a=np.ones(2), shape=(20, 2)) - like2 = pm.Mixture("like2", w=w2, comp_dists=comp2, observed=y) - - comp3 = pm.Poisson.dist(mu=np.ones(2), shape=(20, 2)) - w3 = pm.Dirichlet("w3", a=np.ones(2), shape=(20, 2)) - like3 = pm.Mixture("like3", w=w3, comp_dists=comp3, observed=y) - - # XXX: This needs to be refactored - rand0, rand1, rand2, rand3 = [None] * 4 # draw_values( - # [like0, like1, like2, like3], point=m.initial_point, size=100 - # ) - assert rand0.shape == (100, 20) - assert rand1.shape == (100, 20) - assert rand2.shape == (100, 20) - assert rand3.shape == (100, 20) - - class TestDensityDist: @pytest.mark.parametrize("size", [(), (3,), (3, 2)], ids=str) def test_density_dist_with_random(self, size): diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index d6adfb9c6b..04ff6bd9e0 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -40,6 +40,8 @@ from pymc.aesaraf import floatX from pymc.distributions.shape_utils import to_tuple from 
pymc.tests.helpers import SeededTest +from pymc.tests.test_distributions import Domain, Simplex +from pymc.tests.test_distributions_random import pymc_random pytestmark = pytest.mark.xfail(reason="Mixture not refactored.") @@ -295,6 +297,43 @@ def build_toy_dataset(N, K): assert prior["mu0"].shape == (n_samples, D) assert prior["chol_cov_0"].shape == (n_samples, D * (D + 1) // 2) + @pytest.mark.xfail(reason="This distribution has not been refactored for v4") + def test_mixture_random_shape(self): + # test the shape broadcasting in mixture random + y = np.concatenate([np.random.poisson(5, size=10), np.random.poisson(9, size=10)]) + with pm.Model() as m: + comp0 = pm.Poisson.dist(mu=np.ones(2)) + w0 = pm.Dirichlet("w0", a=np.ones(2), shape=(2,)) + like0 = pm.Mixture("like0", w=w0, comp_dists=comp0, observed=y) + + comp1 = pm.Poisson.dist(mu=np.ones((20, 2)), shape=(20, 2)) + w1 = pm.Dirichlet("w1", a=np.ones(2), shape=(2,)) + like1 = pm.Mixture("like1", w=w1, comp_dists=comp1, observed=y) + + comp2 = pm.Poisson.dist(mu=np.ones(2)) + w2 = pm.Dirichlet("w2", a=np.ones(2), shape=(20, 2)) + like2 = pm.Mixture("like2", w=w2, comp_dists=comp2, observed=y) + + comp3 = pm.Poisson.dist(mu=np.ones(2), shape=(20, 2)) + w3 = pm.Dirichlet("w3", a=np.ones(2), shape=(20, 2)) + like3 = pm.Mixture("like3", w=w3, comp_dists=comp3, observed=y) + + # XXX: This needs to be refactored + rand0, rand1, rand2, rand3 = [None] * 4 # draw_values( + # [like0, like1, like2, like3], point=m.initial_point, size=100 + # ) + assert rand0.shape == (100, 20) + assert rand1.shape == (100, 20) + assert rand2.shape == (100, 20) + assert rand3.shape == (100, 20) + + with m: + ppc = pm.sample_posterior_predictive([m.compute_initial_point()], samples=200) + assert ppc["like0"].shape == (200, 20) + assert ppc["like1"].shape == (200, 20) + assert ppc["like2"].shape == (200, 20) + assert ppc["like3"].shape == (200, 20) + class TestNormalMixture(SeededTest): @classmethod @@ -393,6 +432,34 @@ def test_normal_mixture_nd(self, nd, ncomp): if obs2 is not None: assert_allclose(obs0.logp(testpoint), obs2.logp(testpoint)) + def test_random(self): + def ref_rand(size, w, mu, sigma): + component = np.random.choice(w.size, size=size, p=w) + return np.random.normal(mu[component], sigma[component], size=size) + + pymc_random( + pm.NormalMixture, + { + "w": Simplex(2), + "mu": Domain([[0.05, 2.5], [-5.0, 1.0]], edges=(None, None)), + "sigma": Domain([[1, 1], [1.5, 2.0]], edges=(None, None)), + }, + extra_args={"comp_shape": 2}, + size=1000, + ref_rand=ref_rand, + ) + pymc_random( + pm.NormalMixture, + { + "w": Simplex(3), + "mu": Domain([[-5.0, 1.0, 2.5]], edges=(None, None)), + "sigma": Domain([[1.5, 2.0, 3.0]], edges=(None, None)), + }, + extra_args={"comp_shape": 3}, + size=1000, + ref_rand=ref_rand, + ) + class TestMixtureVsLatent(SeededTest): def setup_method(self, *args, **kwargs): From 88b1240e997c9d3f02647fe0bdd0eb298d183675 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Fri, 4 Feb 2022 09:47:07 +0100 Subject: [PATCH 04/13] Use specific imports in test_mixture --- pymc/tests/test_mixture.py | 88 +++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index 04ff6bd9e0..bb9a8801c5 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -21,24 +21,28 @@ from numpy.testing import assert_allclose from scipy.special import logsumexp -import pymc as pm - -from pymc import ( +from pymc.aesaraf import floatX +from pymc.distributions 
import ( + Categorical, Dirichlet, Exponential, Gamma, + HalfNormal, + LKJCholeskyCov, LogNormal, - Metropolis, Mixture, - Model, + MixtureSameFamily, + Multinomial, MvNormal, Normal, NormalMixture, Poisson, - sample, ) -from pymc.aesaraf import floatX from pymc.distributions.shape_utils import to_tuple +from pymc.math import expand_packed_triangular +from pymc.model import Model +from pymc.sampling import sample, sample_posterior_predictive, sample_prior_predictive +from pymc.step_methods import Metropolis from pymc.tests.helpers import SeededTest from pymc.tests.test_distributions import Domain, Simplex from pymc.tests.test_distributions_random import pymc_random @@ -267,31 +271,31 @@ def build_toy_dataset(N, K): X, y = build_toy_dataset(N, K) - with pm.Model() as model: - pi = pm.Dirichlet("pi", np.ones(K), shape=(K,)) + with Model() as model: + pi = Dirichlet("pi", np.ones(K), shape=(K,)) comp_dist = [] mu = [] packed_chol = [] chol = [] for i in range(K): - mu.append(pm.Normal("mu%i" % i, 0, 10, shape=D)) + mu.append(Normal("mu%i" % i, 0, 10, shape=D)) packed_chol.append( - pm.LKJCholeskyCov( - "chol_cov_%i" % i, eta=2, n=D, sd_dist=pm.HalfNormal.dist(2.5, size=D) + LKJCholeskyCov( + "chol_cov_%i" % i, eta=2, n=D, sd_dist=HalfNormal.dist(2.5, size=D) ) ) - chol.append(pm.expand_packed_triangular(D, packed_chol[i], lower=True)) - comp_dist.append(pm.MvNormal.dist(mu=mu[i], chol=chol[i], shape=D)) + chol.append(expand_packed_triangular(D, packed_chol[i], lower=True)) + comp_dist.append(MvNormal.dist(mu=mu[i], chol=chol[i], shape=D)) - pm.Mixture("x_obs", pi, comp_dist, observed=X) + Mixture("x_obs", pi, comp_dist, observed=X) with model: - idata = pm.sample(30, tune=10, chains=1) + idata = sample(30, tune=10, chains=1) n_samples = 20 with model: - ppc = pm.sample_posterior_predictive(idata, n_samples) - prior = pm.sample_prior_predictive(samples=n_samples) + ppc = sample_posterior_predictive(idata, n_samples) + prior = sample_prior_predictive(samples=n_samples) assert ppc["x_obs"].shape == (n_samples,) + X.shape assert prior["x_obs"].shape == (n_samples,) + X.shape assert prior["mu0"].shape == (n_samples, D) @@ -438,7 +442,7 @@ def ref_rand(size, w, mu, sigma): return np.random.normal(mu[component], sigma[component], size=size) pymc_random( - pm.NormalMixture, + NormalMixture, { "w": Simplex(2), "mu": Domain([[0.05, 2.5], [-5.0, 1.0]], edges=(None, None)), @@ -449,7 +453,7 @@ def ref_rand(size, w, mu, sigma): ref_rand=ref_rand, ) pymc_random( - pm.NormalMixture, + NormalMixture, { "w": Simplex(3), "mu": Domain([[-5.0, 1.0, 2.5]], edges=(None, None)), @@ -487,12 +491,12 @@ def test_1d_w(self): npop = self.npop mus = self.mus size = 100 - with pm.Model() as model: - m = pm.NormalMixture( + with Model() as model: + m = NormalMixture( "m", w=np.ones(npop) / npop, mu=mus, sigma=1e-5, comp_shape=(nd, npop), shape=nd ) - z = pm.Categorical("z", p=np.ones(npop) / npop) - latent_m = pm.Normal("latent_m", mu=mus[..., z], sigma=1e-5, shape=nd) + z = Categorical("z", p=np.ones(npop) / npop) + latent_m = Normal("latent_m", mu=mus[..., z], sigma=1e-5, shape=nd) m_val = m.random(size=size) latent_m_val = latent_m.random(size=size) @@ -510,8 +514,8 @@ def test_2d_w(self): npop = self.npop mus = self.mus size = 100 - with pm.Model() as model: - m = pm.NormalMixture( + with Model() as model: + m = NormalMixture( "m", w=np.ones((nd, npop)) / npop, mu=mus, @@ -519,9 +523,9 @@ def test_2d_w(self): comp_shape=(nd, npop), shape=nd, ) - z = pm.Categorical("z", p=np.ones(npop) / npop, shape=nd) + z = 
Categorical("z", p=np.ones(npop) / npop, shape=nd) mu = at.as_tensor_variable([mus[i, z[i]] for i in range(nd)]) - latent_m = pm.Normal("latent_m", mu=mu, sigma=1e-5, shape=nd) + latent_m = Normal("latent_m", mu=mu, sigma=1e-5, shape=nd) m_val = m.random(size=size) latent_m_val = latent_m.random(size=size) @@ -583,16 +587,16 @@ def test_with_multinomial(self, batch_shape): n = 100 * np.ones((*batch_shape, 1)) w = np.ones(self.mixture_comps) / self.mixture_comps mixture_axis = len(batch_shape) - with pm.Model() as model: - comp_dists = pm.Multinomial.dist(p=p, n=n, shape=(*batch_shape, self.mixture_comps, 3)) - mixture = pm.MixtureSameFamily( + with Model() as model: + comp_dists = Multinomial.dist(p=p, n=n, shape=(*batch_shape, self.mixture_comps, 3)) + mixture = MixtureSameFamily( "mixture", w=w, comp_dists=comp_dists, mixture_axis=mixture_axis, shape=(*batch_shape, 3), ) - prior = pm.sample_prior_predictive(samples=self.n_samples) + prior = sample_prior_predictive(samples=self.n_samples) assert prior["mixture"].shape == (self.n_samples, *batch_shape, 3) assert mixture.random(size=self.size).shape == (self.size, *batch_shape, 3) @@ -623,12 +627,12 @@ def test_with_mvnormal(self): chol = np.linalg.cholesky(cov) w = np.ones(self.mixture_comps) / self.mixture_comps - with pm.Model() as model: - comp_dists = pm.MvNormal.dist(mu=mu, chol=chol, shape=(self.mixture_comps, 3)) - mixture = pm.MixtureSameFamily( + with Model() as model: + comp_dists = MvNormal.dist(mu=mu, chol=chol, shape=(self.mixture_comps, 3)) + mixture = MixtureSameFamily( "mixture", w=w, comp_dists=comp_dists, mixture_axis=0, shape=(3,) ) - prior = pm.sample_prior_predictive(samples=self.n_samples) + prior = sample_prior_predictive(samples=self.n_samples) assert prior["mixture"].shape == (self.n_samples, 3) assert mixture.random(size=self.size).shape == (self.size, 3) @@ -650,12 +654,10 @@ def test_with_mvnormal(self): ) def test_broadcasting_in_shape(self): - with pm.Model() as model: - mu = pm.Gamma("mu", 1.0, 1.0, shape=2) - comp_dists = pm.Poisson.dist(mu, shape=2) - mix = pm.MixtureSameFamily( - "mix", w=np.ones(2) / 2, comp_dists=comp_dists, shape=(1000,) - ) - prior = pm.sample_prior_predictive(samples=self.n_samples) + with Model() as model: + mu = Gamma("mu", 1.0, 1.0, shape=2) + comp_dists = Poisson.dist(mu, shape=2) + mix = MixtureSameFamily("mix", w=np.ones(2) / 2, comp_dists=comp_dists, shape=(1000,)) + prior = sample_prior_predictive(samples=self.n_samples) assert prior["mix"].shape == (self.n_samples, 1000) From 0db54144d9571b872a5cea5f07bedf0274f4181f Mon Sep 17 00:00:00 2001 From: Ricardo Date: Fri, 4 Feb 2022 13:51:45 +0100 Subject: [PATCH 05/13] Reenable Mixture tests in pytest workflow --- .github/workflows/pytest.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 9f6129d45e..76ed533fb8 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -78,6 +78,7 @@ jobs: pymc/tests/test_transforms.py pymc/tests/test_smc.py pymc/tests/test_bart.py + pymc/tests/test_mixture.py - | pymc/tests/test_parallel_sampling.py From 6a1d7391ebe0b5c43af5fdaec9ea60980dfd9aa6 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 1 Feb 2022 15:54:25 +0100 Subject: [PATCH 06/13] Refactor Mixture distribution Mixtures now use an `OpFromGraph` that encapsulates the Aesara random method. This is used so that logp can be easily dispatched to the distribution without requiring involved pattern matching. 
The Mixture random and logp methods now fully respect the support dimensionality of its components, whereas previously only the logp method did, leading to inconsistencies between the two methods. In the case where the weights (or size) indicate the need for more draws than what is given by the component distributions, the latter are resized to ensure there are no repeated draws. This refactoring forces Mixture components to be basic RandomVariables, meaning that nested Mixtures or Mixtures of Symbolic distributions (like Censored) are not currently possible. Co-authored-by: Larry Dong --- pymc/distributions/mixture.py | 661 ++++++++++++---------------------- pymc/tests/test_mixture.py | 625 ++++++++++++++++++++++++-------- 2 files changed, 698 insertions(+), 588 deletions(-) diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py index 6105bb726c..bd24748d73 100644 --- a/pymc/distributions/mixture.py +++ b/pymc/distributions/mixture.py @@ -11,19 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from collections.abc import Iterable - import aesara import aesara.tensor as at import numpy as np -from pymc.aesaraf import _conversion_map, take_along_axis -from pymc.distributions.continuous import Normal, get_tau_sigma +from aeppl.abstract import MeasurableVariable, _get_measurable_outputs +from aeppl.logprob import _logprob +from aesara.compile.builders import OpFromGraph +from aesara.tensor import TensorVariable +from aesara.tensor.random.op import RandomVariable + +from pymc.aesaraf import change_rv_size, take_along_axis +from pymc.distributions.continuous import Normal from pymc.distributions.dist_math import check_parameters -from pymc.distributions.distribution import Discrete, Distribution +from pymc.distributions.distribution import Discrete, Distribution, SymbolicDistribution +from pymc.distributions.logprob import logp from pymc.distributions.shape_utils import to_tuple from pymc.math import logsumexp +from pymc.util import check_dist_not_registered __all__ = ["Mixture", "NormalMixture", "MixtureSameFamily"] @@ -38,7 +43,14 @@ def all_discrete(comp_dists): return all(isinstance(comp_dist, Discrete) for comp_dist in comp_dists) -class Mixture(Distribution): +class MarginalMixtureRV(OpFromGraph): + """A placeholder used to specify a log-likelihood for a mixture sub-graph.""" + + +MeasurableVariable.register(MarginalMixtureRV) + + +class Mixture(SymbolicDistribution): R""" Mixture log-likelihood @@ -112,454 +124,239 @@ class Mixture(Distribution): like = pm.Mixture('like', w=w, comp_dists = components, observed=data, shape=3) """ - def __init__(self, w, comp_dists, *args, **kwargs): - # comp_dists type checking - if not ( - isinstance(comp_dists, Distribution) - or ( - isinstance(comp_dists, Iterable) - and all(isinstance(c, Distribution) for c in comp_dists) - ) - ): - raise TypeError( - "Supplied Mixture comp_dists must be a " - "Distribution or an iterable of " - "Distributions. 
Got {} instead.".format( - type(comp_dists) - if not isinstance(comp_dists, Iterable) - else [type(c) for c in comp_dists] + @classmethod + def dist(cls, w, comp_dists, **kwargs): + if not isinstance(comp_dists, (tuple, list)): + # comp_dists is a single component + comp_dists = [comp_dists] + + # Check that components are not associated with a registered variable in the model + components_ndim = set() + components_ndim_supp = set() + for dist in comp_dists: + # TODO: Allow these to not be a RandomVariable as long as we can call `ndim_supp` on them + # and resize them + if not isinstance(dist, TensorVariable) or not isinstance( + dist.owner.op, RandomVariable + ): + raise ValueError( + f"Component dist must be a distribution created via the `.dist()` API, got {type(dist)}" ) + check_dist_not_registered(dist) + components_ndim.add(dist.ndim) + components_ndim_supp.add(dist.owner.op.ndim_supp) + + if len(components_ndim) > 1: + raise ValueError( + f"Mixture components must all have the same dimensionality, got {components_ndim}" ) - shape = kwargs.pop("shape", ()) - self.w = w = at.as_tensor_variable(w) - self.comp_dists = comp_dists + if len(components_ndim_supp) > 1: + raise ValueError( + f"Mixture components must all have the same support dimensionality, got {components_ndim_supp}" + ) - defaults = kwargs.pop("defaults", []) + w = at.as_tensor_variable(w) + return super().dist([w, *comp_dists], **kwargs) - if all_discrete(comp_dists): - default_dtype = _conversion_map[aesara.config.floatX] + @classmethod + def rv_op(cls, weights, *components, size=None, rngs=None): + # Update rngs if provided + if rngs is not None: + components = cls._reseed_components(rngs, *components) + *_, mix_indexes_rng = rngs else: - default_dtype = aesara.config.floatX - - try: - self.mean = (w * self._comp_means()).sum(axis=-1) - - if "mean" not in defaults: - defaults.append("mean") - except AttributeError: - pass - dtype = kwargs.pop("dtype", default_dtype) - - try: - if isinstance(comp_dists, Distribution): - comp_mode_logps = comp_dists.logp(comp_dists.mode) - else: - comp_mode_logps = at.stack([cd.logp(cd.mode) for cd in comp_dists]) - - mode_idx = at.argmax(at.log(w) + comp_mode_logps, axis=-1) - self.mode = self._comp_modes()[mode_idx] - - if "mode" not in defaults: - defaults.append("mode") - except (AttributeError, ValueError, IndexError): - pass - - super().__init__(shape, dtype, defaults=defaults, *args, **kwargs) - - @property - def comp_dists(self): - return self._comp_dists - - @comp_dists.setter - def comp_dists(self, comp_dists): - self._comp_dists = comp_dists - if isinstance(comp_dists, Distribution): - self._comp_dist_shapes = to_tuple(comp_dists.shape) - self._broadcast_shape = self._comp_dist_shapes - self.comp_is_distribution = True - else: - # Now we check the comp_dists distribution shape, see what - # the broadcast shape would be. This shape will be the dist_shape - # used by generate samples (the shape of a single random sample) - # from the mixture - self._comp_dist_shapes = [to_tuple(d.shape) for d in comp_dists] - # All component distributions must broadcast with each other - try: - self._broadcast_shape = np.broadcast( - *(np.empty(shape) for shape in self._comp_dist_shapes) - ).shape - except Exception: - raise TypeError( - "Supplied comp_dists shapes do not broadcast " - "with each other. 
comp_dists shapes are: " - "{}".format(self._comp_dist_shapes) - ) + # Create new rng for the mix_indexes internal RV + mix_indexes_rng = aesara.shared(np.random.default_rng()) + + if size is not None: + components = cls._resize_components(size, *components) + + single_component = len(components) == 1 + + # Extract support and replication ndims from components and weights + ndim_supp = components[0].owner.op.ndim_supp + ndim_batch = components[0].ndim - ndim_supp + if single_component: + # One dimension is taken by the mixture axis in the single component case + ndim_batch -= 1 + + # The weights may imply extra batch dimensions that go beyond what is already + # implied by the component dimensions (ndim_batch) + weights_ndim_batch = max(0, weights.ndim - ndim_batch - 1) + + # If weights are large enough that they would broadcast the component distributions + # we try to resize them. This in necessary to avoid duplicated values in the + # random method and for equivalency with the logp method + if weights_ndim_batch: + new_size = at.concatenate( + [ + weights.shape[:weights_ndim_batch], + components[0].shape[:ndim_batch], + ] + ) + components = cls._resize_components(new_size, *components) - # We wrap the _comp_dist.random by adding the kwarg raw_size_, - # which will be the size attribute passed to _comp_samples. - # _comp_samples then calls generate_samples, which may change the - # size value to make it compatible with scipy.stats.*.rvs - self._generators = [] - for comp_dist in comp_dists: - generator = Mixture._comp_dist_random_wrapper(comp_dist.random) - self._generators.append(generator) - self.comp_is_distribution = False - - @staticmethod - def _comp_dist_random_wrapper(random): - """Wrap the comp_dists.random method to take the kwarg raw_size_ and - use it's value to replace the size parameter. This is needed because - generate_samples makes the size value compatible with the - scipy.stats.*.rvs, where size has a different meaning than in the - distributions' random methods. - """ + # Extract support and batch ndims from components and weights + ndim_batch = components[0].ndim - ndim_supp + if single_component: + ndim_batch -= 1 + weights_ndim_batch = max(0, weights.ndim - ndim_batch - 1) - def wrapped_random(*args, **kwargs): - raw_size_ = kwargs.pop("raw_size_", None) - # Distribution.random's signature is always (point=None, size=None) - # so size could be the second arg or be given as a kwarg - if len(args) > 1: - args[1] = raw_size_ - else: - kwargs["size"] = raw_size_ - return random(*args, **kwargs) + assert weights_ndim_batch == 0 - return wrapped_random + # Create a OpFromGraph that encapsulates the random generating process + # Create dummy input variables with the same type as the ones provided + weights_ = weights.type() + components_ = [component.type() for component in components] + mix_indexes_rng_ = mix_indexes_rng.type() - def _comp_logp(self, value): - comp_dists = self.comp_dists + mix_axis = -ndim_supp - 1 - if self.comp_is_distribution: - # Value can be many things. It can be the self tensor, the mode - # test point or it can be observed data. The latter case requires - # careful handling of shape, as the observed's shape could look - # like (repetitions,) + dist_shape, which does not include the last - # mixture axis. 
For this reason, we try to eval the value.shape, - # compare it with self.shape and shape_padright if we infer that - # the value holds observed data - try: - val_shape = tuple(value.shape.eval()) - except AttributeError: - val_shape = value.shape - except aesara.graph.fg.MissingInputError: - val_shape = None - try: - self_shape = tuple(self.shape) - except AttributeError: - # Happens in __init__ when computing self.logp(comp_modes) - self_shape = None - comp_shape = tuple(comp_dists.shape) - ndim = value.ndim - if val_shape is not None and not ( - (self_shape is not None and val_shape == self_shape) or val_shape == comp_shape - ): - # value is neither the test point nor the self tensor, it - # is likely to hold observed values, so we must compute the - # ndim discarding the dimensions that don't match - # self_shape - if self_shape and val_shape[-len(self_shape) :] == self_shape: - # value has observed values for the Mixture - ndim = len(self_shape) - elif comp_shape and val_shape[-len(comp_shape) :] == comp_shape: - # value has observed for the Mixture components - ndim = len(comp_shape) - else: - # We cannot infer what was passed, we handle this - # as was done in earlier versions of Mixture. We pad - # always if ndim is lower or equal to 1 (default - # legacy implementation) - if ndim <= 1: - ndim = len(comp_dists.shape) - 1 - else: - # We reach this point if value does not hold observed data, so - # we can use its ndim safely to determine shape padding, or it - # holds something that we cannot infer, so we revert to using - # the value's ndim for shape padding. - # We will always pad a single dimension if ndim is lower or - # equal to 1 (default legacy implementation) - if ndim <= 1: - ndim = len(comp_dists.shape) - 1 - if ndim < len(comp_dists.shape): - value_ = at.shape_padright(value, len(comp_dists.shape) - ndim) - else: - value_ = value - return comp_dists.logp(value_) + # Stack components across mixture axis + if single_component: + # If single component, we consider it as being already "stacked" + stacked_components_ = components_[0] else: - return at.squeeze( - at.stack([comp_dist.logp(value) for comp_dist in comp_dists], axis=-1) - ) + stacked_components_ = at.stack(components_, axis=mix_axis) - def _comp_means(self): - try: - return at.as_tensor_variable(self.comp_dists.mean) - except AttributeError: - return at.squeeze(at.stack([comp_dist.mean for comp_dist in self.comp_dists], axis=-1)) - - def _comp_modes(self): - try: - return at.as_tensor_variable(self.comp_dists.mode) - except AttributeError: - return at.squeeze(at.stack([comp_dist.mode for comp_dist in self.comp_dists], axis=-1)) - - def _comp_samples(self, point=None, size=None, comp_dist_shapes=None, broadcast_shape=None): - # if self.comp_is_distribution: - # samples = self._comp_dists.random(point=point, size=size) - # else: - # if comp_dist_shapes is None: - # comp_dist_shapes = self._comp_dist_shapes - # if broadcast_shape is None: - # broadcast_shape = self._sample_shape - # samples = [] - # for dist_shape, generator in zip(comp_dist_shapes, self._generators): - # sample = generate_samples( - # generator=generator, - # dist_shape=dist_shape, - # broadcast_shape=broadcast_shape, - # point=point, - # size=size, - # not_broadcast_kwargs={"raw_size_": size}, - # ) - # samples.append(sample) - # samples = np.array(broadcast_distribution_samples(samples, size=size)) - # # In the logp we assume the last axis holds the mixture components - # # so we move the axis to the last dimension - # samples = np.moveaxis(samples, 
0, -1) - # return samples.astype(self.dtype) - pass - - def infer_comp_dist_shapes(self, point=None): - """Try to infer the shapes of the component distributions, - `comp_dists`, and how they should broadcast together. - The behavior is slightly different if `comp_dists` is a `Distribution` - as compared to when it is a list of `Distribution`s. When it is a list - the following procedure is repeated for each element in the list: - 1. Look up the `comp_dists.shape` - 2. If it is not empty, use it as `comp_dist_shape` - 3. If it is an empty tuple, a single random sample is drawn by calling - `comp_dists.random(point=point, size=None)`, and the returned - test_sample's shape is used as the inferred `comp_dists.shape` + # Broadcast weights to (*batched dimensions, stack dimension), ignoring support dimensions + weights_broadcast_shape_ = stacked_components_.shape[: ndim_batch + 1] + weights_broadcasted_ = at.broadcast_to(weights_, weights_broadcast_shape_) - Parameters - ---------- - point: None or dict (optional) - Dictionary that maps rv names to values, to supply to - `self.comp_dists.random` + # Draw mixture indexes and append (stack + ndim_supp) broadcastable dimensions to the right + mix_indexes_ = at.random.categorical(weights_broadcasted_, rng=mix_indexes_rng_) + mix_indexes_padded_ = at.shape_padright(mix_indexes_, ndim_supp + 1) - Returns - ------- - comp_dist_shapes: shape tuple or list of shape tuples. - If `comp_dists` is a `Distribution`, it is a shape tuple of the - inferred distribution shape. - If `comp_dists` is a list of `Distribution`s, it is a list of - shape tuples inferred for each element in `comp_dists` - broadcast_shape: shape tuple - The shape that results from broadcasting all component's shapes - together. - """ - # if self.comp_is_distribution: - # if len(self._comp_dist_shapes) > 0: - # comp_dist_shapes = self._comp_dist_shapes - # else: - # # Happens when the distribution is a scalar or when it was not - # # given a shape. In these cases we try to draw a single value - # # to check its shape, we use the provided point dictionary - # # hoping that it can circumvent the Flat and HalfFlat - # # undrawable distributions. - # with _DrawValuesContextBlocker(): - # test_sample = self._comp_dists.random(point=point, size=None) - # comp_dist_shapes = test_sample.shape - # broadcast_shape = comp_dist_shapes - # else: - # # Now we check the comp_dists distribution shape, see what - # # the broadcast shape would be. This shape will be the dist_shape - # # used by generate samples (the shape of a single random sample) - # # from the mixture - # comp_dist_shapes = [] - # for dist_shape, comp_dist in zip(self._comp_dist_shapes, self._comp_dists): - # if dist_shape == tuple(): - # # Happens when the distribution is a scalar or when it was - # # not given a shape. In these cases we try to draw a single - # # value to check its shape, we use the provided point - # # dictionary hoping that it can circumvent the Flat and - # # HalfFlat undrawable distributions. - # with _DrawValuesContextBlocker(): - # test_sample = comp_dist.random(point=point, size=None) - # dist_shape = test_sample.shape - # comp_dist_shapes.append(dist_shape) - # # All component distributions must broadcast with each other - # try: - # broadcast_shape = np.broadcast( - # *[np.empty(shape) for shape in comp_dist_shapes] - # ).shape - # except Exception: - # raise TypeError( - # "Inferred comp_dist shapes do not broadcast " - # "with each other. 
comp_dists inferred shapes " - # "are: {}".format(comp_dist_shapes) - # ) - # return comp_dist_shapes, broadcast_shape + # Index components and squeeze mixture dimension + mix_out_ = at.take_along_axis(stacked_components_, mix_indexes_padded_, axis=mix_axis) + # There is a Aeasara bug in squeeze with negative axis + # this is equivalent to np.squeeze(mix_out_, axis=mix_axis) + mix_out_ = at.squeeze(mix_out_, axis=mix_out_.ndim + mix_axis) - def logp(self, value): - """ - Calculate log-probability of defined Mixture distribution at specified value. + # Output mix_indexes rng update so that it can be updated in place + mix_indexes_rng_next_ = mix_indexes_.owner.outputs[0] - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or Aesara tensor - - Returns - ------- - TensorVariable - """ - w = self.w + mix_op = MarginalMixtureRV( + inputs=[mix_indexes_rng_, weights_, *components_], + outputs=[mix_indexes_rng_next_, mix_out_], + ) - return check_parameters( - logsumexp(at.log(w) + self._comp_logp(value), axis=-1, keepdims=False), - w >= 0, - w <= 1, - at.allclose(w.sum(axis=-1), 1), - broadcast_conditions=False, + # Create the actual MarginalMixture variable + mix_indexes_rng_next, mix_out = mix_op(mix_indexes_rng, weights, *components) + + # We need to set_default_updates ourselves, because the choices RV is hidden + # inside OpFromGraph and PyMC will never find it otherwise + mix_indexes_rng.default_update = mix_indexes_rng_next + + # Reference nodes to facilitate identification in other classmethods + mix_out.tag.weights = weights + mix_out.tag.components = components + mix_out.tag.choices_rng = mix_indexes_rng + + # Component RVs terms are accounted by the Mixture logprob, so they can be + # safely ignore by Aeppl (this tag prevents UserWarning) + for component in components: + component.tag.ignore_logprob = True + + return mix_out + + @classmethod + def _reseed_components(cls, rngs, *components): + *components_rngs, mix_indexes_rng = rngs + assert len(components) == len(components_rngs) + new_components = [] + for component, component_rng in zip(components, components_rngs): + component_node = component.owner + old_rng, *inputs = component_node.inputs + new_components.append( + component_node.op.make_node(component_rng, *inputs).default_output() + ) + return new_components + + @classmethod + def _resize_components(cls, size, *components): + if len(components) == 1: + # If we have a single component, we need to keep the length of the mixture + # axis intact, because that's what determines the number of mixture components + mix_axis = -components[0].owner.op.ndim_supp - 1 + mix_size = components[0].shape[mix_axis] + size = tuple(size) + (mix_size,) + + return [change_rv_size(component, size) for component in components] + + @classmethod + def ndim_supp(cls, weights, *components): + # We already checked that all components have the same support dimensionality + return components[0].owner.op.ndim_supp + + @classmethod + def change_size(cls, rv, new_size, expand=False): + weights = rv.tag.weights + components = rv.tag.components + rngs = [component.owner.inputs[0] for component in components] + [rv.tag.choices_rng] + + if expand: + component = rv.tag.components[0] + # Old size is equal to `shape[:-ndim_supp]`, with care needed for `ndim_supp == 0` + size_dims = component.ndim - component.owner.op.ndim_supp + if len(rv.tag.components) == 1: + # If we 
have a single component, new size should ignore the mixture axis + # dimension, as that is not touched by `_resize_components` + size_dims -= 1 + old_size = components[0].shape[:size_dims] + new_size = to_tuple(new_size) + tuple(old_size) + + components = cls._resize_components(new_size, *components) + + return cls.rv_op(weights, *components, rngs=rngs, size=None) + + @classmethod + def graph_rvs(cls, rv): + # We return rv, which is itself a pseudo RandomVariable, that contains a + # mix_indexes_ RV in its inner graph. We want super().dist() to generate + # (components + 1) rngs for us, and it will do so based on how many elements + # we return here + return (*rv.tag.components, rv) + + +@_get_measurable_outputs.register(MarginalMixtureRV) +def _get_measurable_outputs_MarginalMixtureRV(op, node): + # This tells Aeppl that the second output is the measurable one + return [node.outputs[1]] + + +@_logprob.register(MarginalMixtureRV) +def marginal_mixture_logprob(op, values, rng, weights, *components, **kwargs): + (value,) = values + + # single component + if len(components) == 1: + # Need to broadcast value across mixture axis + mix_axis = -components[0].owner.op.ndim_supp - 1 + components_logp = logp(components[0], at.expand_dims(value, mix_axis)) + else: + components_logp = at.stack( + [logp(component, value) for component in components], + axis=-1, ) - def random(self, point=None, size=None): - """ - Draw random values from defined Mixture distribution. + mix_logp = at.logsumexp(at.log(weights) + components_logp, axis=-1) - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). + # Squeeze stack dimension + # There is a Aeasara bug in squeeze with negative axis + # mix_logp = at.squeeze(mix_logp, axis=-1) + mix_logp = at.squeeze(mix_logp, axis=mix_logp.ndim - 1) - Returns - ------- - array - """ - # # Convert size to tuple - # size = to_tuple(size) - # # Draw mixture weights and infer the comp_dists shapes - # with _DrawValuesContext() as draw_context: - # # We first need to check w and comp_tmp shapes and re compute size - # w = draw_values([self.w], point=point, size=size)[0] - # comp_dist_shapes, broadcast_shape = self.infer_comp_dist_shapes(point=point) - # - # # When size is not None, it's hard to tell the w parameter shape - # if size is not None and w.shape[: len(size)] == size: - # w_shape = w.shape[len(size) :] - # else: - # w_shape = w.shape - # - # # Try to determine parameter shape and dist_shape - # if self.comp_is_distribution: - # param_shape = np.broadcast(np.empty(w_shape), np.empty(broadcast_shape)).shape - # else: - # param_shape = np.broadcast(np.empty(w_shape), np.empty(broadcast_shape + (1,))).shape - # if np.asarray(self.shape).size != 0: - # dist_shape = np.broadcast(np.empty(self.shape), np.empty(param_shape[:-1])).shape - # else: - # dist_shape = param_shape[:-1] - # - # # Try to determine the size that must be used to get the mixture - # # components (i.e. get random choices using w). - # # 1. There must be size independent choices based on w. - # # 2. There must also be independent draws for each non singleton axis - # # of w. - # # 3. There must also be independent draws for each dimension added by - # # self.shape with respect to the w.ndim. 
These usually correspond to - # # observed variables with batch shapes - # wsh = (1,) * (len(dist_shape) - len(w_shape) + 1) + w_shape[:-1] - # psh = (1,) * (len(dist_shape) - len(param_shape) + 1) + param_shape[:-1] - # w_sample_size = [] - # # Loop through the dist_shape to get the conditions 2 and 3 first - # for i in range(len(dist_shape)): - # if dist_shape[i] != psh[i] and wsh[i] == 1: - # # self.shape[i] is a non singleton dimension (usually caused by - # # observed data) - # sh = dist_shape[i] - # else: - # sh = wsh[i] - # w_sample_size.append(sh) - # if size is not None and w_sample_size[: len(size)] != size: - # w_sample_size = size + tuple(w_sample_size) - # # Broadcast w to the w_sample_size (add a singleton last axis for the - # # mixture components) - # w = broadcast_distribution_samples([w, np.empty(w_sample_size + (1,))], size=size)[0] - # - # # Semiflatten the mixture weights. The last axis is the number of - # # mixture mixture components, and the rest is all about size, - # # dist_shape and broadcasting - # w_ = np.reshape(w, (-1, w.shape[-1])) - # w_samples = random_choice(p=w_, size=None) # w's shape already includes size - # # Now we broadcast the chosen components to the dist_shape - # w_samples = np.reshape(w_samples, w.shape[:-1]) - # if size is not None and dist_shape[: len(size)] != size: - # w_samples = np.broadcast_to(w_samples, size + dist_shape) - # else: - # w_samples = np.broadcast_to(w_samples, dist_shape) - # - # # When size is not None, maybe dist_shape partially overlaps with size - # if size is not None: - # if size == dist_shape: - # size = None - # elif size[-len(dist_shape) :] == dist_shape: - # size = size[: len(size) - len(dist_shape)] - # - # # We get an integer _size instead of a tuple size for drawing the - # # mixture, then we just reshape the output - # if size is None: - # _size = None - # else: - # _size = int(np.prod(size)) - # - # # Compute the total size of the mixture's random call with size - # if _size is not None: - # output_size = int(_size * np.prod(dist_shape) * param_shape[-1]) - # else: - # output_size = int(np.prod(dist_shape) * param_shape[-1]) - # # Get the size we need for the mixture's random call - # if self.comp_is_distribution: - # mixture_size = int(output_size // np.prod(broadcast_shape)) - # else: - # mixture_size = int(output_size // (np.prod(broadcast_shape) * param_shape[-1])) - # if mixture_size == 1 and _size is None: - # mixture_size = None - # - # # Sample from the mixture - # with draw_context: - # mixed_samples = self._comp_samples( - # point=point, - # size=mixture_size, - # broadcast_shape=broadcast_shape, - # comp_dist_shapes=comp_dist_shapes, - # ) - # # Test that the mixture has the same number of "samples" as w - # if w_samples.size != (mixed_samples.size // w.shape[-1]): - # raise ValueError( - # "Inconsistent number of samples from the " - # "mixture and mixture weights. 
Drew {} mixture " - # "weights elements, and {} samples from the " - # "mixture components.".format(w_samples.size, mixed_samples.size // w.shape[-1]) - # ) - # # Semiflatten the mixture to be able to zip it with w_samples - # w_samples = w_samples.flatten() - # mixed_samples = np.reshape(mixed_samples, (-1, w.shape[-1])) - # # Select the samples from the mixture - # samples = np.array([mixed[choice] for choice, mixed in zip(w_samples, mixed_samples)]) - # # Reshape the samples to the correct output shape - # if size is None: - # samples = np.reshape(samples, dist_shape) - # else: - # samples = np.reshape(samples, size + dist_shape) - # return samples + mix_logp = check_parameters( + mix_logp, + 0 <= weights, + weights <= 1, + at.isclose(at.sum(weights, axis=-1), 1), + msg="0 <= weights <= 1, sum(weights) == 1", + ) - def _distr_parameters_for_repr(self): - return [] + return mix_logp class NormalMixture(Mixture): diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index bb9a8801c5..ecbaf5b52d 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from contextlib import ExitStack as does_not_raise + import aesara import numpy as np import pytest @@ -25,6 +27,7 @@ from pymc.distributions import ( Categorical, Dirichlet, + DirichletMultinomial, Exponential, Gamma, HalfNormal, @@ -38,18 +41,22 @@ NormalMixture, Poisson, ) +from pymc.distributions.logprob import logp from pymc.distributions.shape_utils import to_tuple from pymc.math import expand_packed_triangular from pymc.model import Model -from pymc.sampling import sample, sample_posterior_predictive, sample_prior_predictive +from pymc.sampling import ( + draw, + sample, + sample_posterior_predictive, + sample_prior_predictive, +) from pymc.step_methods import Metropolis from pymc.tests.helpers import SeededTest from pymc.tests.test_distributions import Domain, Simplex from pymc.tests.test_distributions_random import pymc_random -pytestmark = pytest.mark.xfail(reason="Mixture not refactored.") -# Generate data def generate_normal_mixture_data(w, mu, sd, size=1000): component = np.random.choice(w.size, size=size, p=w) mu, sd = np.broadcast_arrays(mu, sd) @@ -75,76 +82,318 @@ def generate_poisson_mixture_data(w, mu, size=1000): class TestMixture(SeededTest): - @classmethod - def setup_class(cls): - super().setup_class() - - cls.norm_w = np.array([0.75, 0.25]) - cls.norm_mu = np.array([0.0, 5.0]) - cls.norm_sd = np.ones_like(cls.norm_mu) - cls.norm_x = generate_normal_mixture_data(cls.norm_w, cls.norm_mu, cls.norm_sd, size=1000) - - cls.pois_w = np.array([0.4, 0.6]) - cls.pois_mu = np.array([5.0, 20.0]) - cls.pois_x = generate_poisson_mixture_data(cls.pois_w, cls.pois_mu, size=1000) - - def test_dimensions(self): - a1 = Normal.dist(mu=0, sigma=1) - a2 = Normal.dist(mu=10, sigma=1) - mix = Mixture.dist(w=np.r_[0.5, 0.5], comp_dists=[a1, a2]) - - assert mix.mode.ndim == 0 - assert mix.logp(0.0).ndim == 0 - - value = np.r_[0.0, 1.0, 2.0] - assert mix.logp(value).ndim == 1 - - def test_mixture_list_of_normals(self): - with Model() as model: - w = Dirichlet("w", floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) - mu = Normal("mu", 0.0, 10.0, shape=self.norm_w.size) - tau = Gamma("tau", 1.0, 1.0, shape=self.norm_w.size) - Mixture( - "x_obs", - w, - [Normal.dist(mu[0], tau=tau[0]), Normal.dist(mu[1], tau=tau[1])], - observed=self.norm_x, + def get_inital_point(self, model): + """Get 
initial point with untransformed variables for posterior predictive sampling""" + return { + var.name: initial_point + for var, initial_point in zip( + model.unobserved_value_vars, + model.compile_fn(model.unobserved_value_vars)(model.compute_initial_point()), ) - step = Metropolis() - trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) + } - assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.norm_w), rtol=0.1, atol=0.1) - assert_allclose( - np.sort(trace["mu"].mean(axis=0)), np.sort(self.norm_mu), rtol=0.1, atol=0.1 - ) + @pytest.mark.parametrize( + "weights", + [ + np.array([1, 0]), + np.array([[1, 0], [0, 1], [1, 0]]), + ], + ) + @pytest.mark.parametrize( + "component", + [ + Normal.dist([-10, 10]), + Normal.dist([-10, 10], size=(3, 2)), + Normal.dist([[-15, 15], [-10, 10], [-5, 5]], 1e-3), + Normal.dist([-10, 10], size=(4, 3, 2)), + ], + ) + @pytest.mark.parametrize("size", [None, (3,), (4, 3)]) + def test_single_univariate_component_deterministic_weights(self, weights, component, size): + # Size can't be smaller than what is implied by replication dimensions + if size is not None and len(size) < max(component.ndim - 1, weights.ndim - 1): + return + + mix = Mixture.dist(weights, component, size=size) + mix_eval = mix.eval() + + # Test shape + # component shape is either (4, 3, 2), (3, 2) or (2,) + # weights shape is either (3, 2) or (2,) + if size is not None: + expected_shape = size + elif component.ndim == 3: + expected_shape = (4, 3) + elif component.ndim == 2 or weights.ndim == 2: + expected_shape = (3,) + else: + expected_shape = () + assert mix_eval.shape == expected_shape + + # Test draws + expected_positive = np.zeros_like(mix_eval) + if expected_positive.ndim > 0: + expected_positive[..., :] = (weights == 1)[..., 1] + assert np.all((mix_eval > 0) == expected_positive) + repetitions = np.unique(mix_eval).size < mix_eval.size + assert not repetitions + + # Test logp + mix_logp_eval = logp(mix, mix_eval).eval() + assert mix_logp_eval.shape == expected_shape + bcast_weights = np.broadcast_to(weights, (*expected_shape, 2)) + expected_logp = logp(component, mix_eval[..., None]).eval()[bcast_weights == 1] + expected_logp = expected_logp.reshape(expected_shape) + assert np.allclose(mix_logp_eval, expected_logp) - def test_poisson_mixture(self): - with Model() as model: - w = Dirichlet("w", floatX(np.ones_like(self.pois_w)), shape=self.pois_w.shape) - mu = Gamma("mu", 1.0, 1.0, shape=self.pois_w.size) - Mixture("x_obs", w, Poisson.dist(mu), observed=self.pois_x) - step = Metropolis() - trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) + @pytest.mark.parametrize( + "weights", + [ + np.array([1, 0]), + np.array([[1, 0], [0, 1], [1, 0]]), + ], + ) + @pytest.mark.parametrize( + "components", + [ + (Normal.dist(-10, 1e-3), Normal.dist(10, 1e-3)), + (Normal.dist(-10, 1e-3, size=(3,)), Normal.dist(10, 1e-3, size=(3,))), + (Normal.dist([-15, -10, -5], 1e-3), Normal.dist([15, 10, 5], 1e-3)), + (Normal.dist(-10, 1e-3, size=(4, 3)), Normal.dist(10, 1e-3, size=(4, 3))), + ], + ) + @pytest.mark.parametrize("size", [None, (3,), (4, 3)]) + def test_list_univariate_components_deterministic_weights(self, weights, components, size): + # Size can't be smaller than what is implied by replication dimensions + if size is not None and len(size) < max(components[0].ndim, weights.ndim - 1): + return + + mix = Mixture.dist(weights, components, size=size) + mix_eval = mix.eval() + + # Test shape + # components[0] shape is either 
(4, 3), (3,) or () + # weights shape is either (3, 2) or (2,) + if size is not None: + expected_shape = size + elif components[0].ndim == 2: + expected_shape = (4, 3) + elif components[0].ndim == 1 or weights.ndim == 2: + expected_shape = (3,) + else: + expected_shape = () + assert mix_eval.shape == expected_shape + + # Test draws + expected_positive = np.zeros_like(mix_eval) + if expected_positive.ndim > 0: + expected_positive[..., :] = (weights == 1)[..., 1] + assert np.all((mix_eval > 0) == expected_positive) + repetitions = np.unique(mix_eval).size < mix_eval.size + assert not repetitions + + # Test logp + mix_logp_eval = logp(mix, mix_eval).eval() + assert mix_logp_eval.shape == expected_shape + bcast_weights = np.broadcast_to(weights, (*expected_shape, 2)) + expected_logp = np.stack( + ( + logp(components[0], mix_eval).eval(), + logp(components[1], mix_eval).eval(), + ), + axis=-1, + )[bcast_weights == 1] + expected_logp = expected_logp.reshape(expected_shape) + assert np.allclose(mix_logp_eval, expected_logp) - assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.pois_w), rtol=0.1, atol=0.1) - assert_allclose( - np.sort(trace["mu"].mean(axis=0)), np.sort(self.pois_mu), rtol=0.1, atol=0.1 - ) + @pytest.mark.parametrize( + "weights", + [ + np.array([1, 0]), + np.array([[1, 0], [0, 1], [1, 0], [0, 1]]), + ], + ) + @pytest.mark.parametrize( + "component", + [ + DirichletMultinomial.dist(n=[5_000, 10_000], a=np.ones((3,))), + DirichletMultinomial.dist(n=[5_000, 10_000], a=np.ones((3,)), size=(4, 2)), + ], + ) + @pytest.mark.parametrize("size", [None, (4,), (5, 4)]) + def test_single_multivariate_component_deterministic_weights(self, weights, component, size): + # This test needs seeding to avoid repetitions + rngs = [ + aesara.shared(np.random.default_rng(seed)) + for seed in self.get_random_state().randint(2**30, size=2) + ] + mix = Mixture.dist(weights, component, size=size, rngs=rngs) + mix_eval = mix.eval() + + # Test shape + # component shape is either (4, 2, 3), (2, 3) + # weights shape is either (4, 2) or (2,) + if size is not None: + expected_shape = size + (3,) + elif component.ndim == 3 or weights.ndim == 2: + expected_shape = (4, 3) + else: + expected_shape = (3,) + assert mix_eval.shape == expected_shape + + # Test draws + totals = mix_eval.sum(-1) + expected_large_count = (weights == 1)[..., 1] + assert np.all((totals == 10_000) == expected_large_count) + repetitions = np.unique(mix_eval[..., 0]).size < totals.size + assert not repetitions + + # Test logp + mix_logp_eval = logp(mix, mix_eval).eval() + expected_logp_shape = expected_shape[:-1] + assert mix_logp_eval.shape == expected_logp_shape + bcast_weights = np.broadcast_to(weights, (*expected_logp_shape, 2)) + expected_logp = logp(component, mix_eval[..., None, :]).eval()[bcast_weights == 1] + expected_logp = expected_logp.reshape(expected_logp_shape) + assert np.allclose(mix_logp_eval, expected_logp) - def test_mixture_list_of_poissons(self): - with Model() as model: - w = Dirichlet("w", floatX(np.ones_like(self.pois_w)), shape=self.pois_w.shape) - mu = Gamma("mu", 1.0, 1.0, shape=self.pois_w.size) - Mixture("x_obs", w, [Poisson.dist(mu[0]), Poisson.dist(mu[1])], observed=self.pois_x) - step = Metropolis() - trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) + @pytest.mark.parametrize( + "weights", + [ + np.array([1, 0]), + np.array([[1, 0], [0, 1], [1, 0], [0, 1]]), + ], + ) + @pytest.mark.parametrize( + "components", + [ + ( + MvNormal.dist([-15, -10, -5], np.eye(3) * 
1e-3), + MvNormal.dist([15, 10, 5], np.eye(3) * 1e-3), + ), + ( + MvNormal.dist([-15, -10, -5], np.eye(3) * 1e-3, size=(4,)), + MvNormal.dist([15, 10, 5], np.eye(3) * 1e-3, size=(4,)), + ), + ], + ) + @pytest.mark.parametrize("size", [None, (4,), (5, 4)]) + def test_list_multivariate_components_deterministic_weights(self, weights, components, size): + mix = Mixture.dist(weights, components, size=size) + mix_eval = mix.eval() + + # Test shape + # components[0] shape is either (4, 3) or (3,) + # weights shape is either (4, 2) or (2,) + if size is not None: + expected_shape = size + (3,) + elif components[0].ndim == 2 or weights.ndim == 2: + expected_shape = (4, 3) + else: + expected_shape = (3,) + assert mix_eval.shape == expected_shape + + # Test draws + expected_positive = np.zeros_like(mix_eval) + expected_positive[..., :] = (weights == 1)[..., 1, None] + assert np.all((mix_eval > 0) == expected_positive) + repetitions = np.unique(mix_eval).size < mix_eval.size + assert not repetitions + + # Test logp + # MvNormal logp is currently limited to 2d values + expectation = pytest.raises(ValueError) if mix_eval.ndim > 2 else does_not_raise() + with expectation: + mix_logp_eval = logp(mix, mix_eval).eval() + assert mix_logp_eval.shape == expected_shape[:-1] + bcast_weights = np.broadcast_to(weights, (*expected_shape[:-1], 2)) + expected_logp = np.stack( + ( + logp(components[0], mix_eval).eval(), + logp(components[1], mix_eval).eval(), + ), + axis=-1, + )[bcast_weights == 1] + expected_logp = expected_logp.reshape(expected_shape[:-1]) + assert np.allclose(mix_logp_eval, expected_logp) + + def test_component_choice_random(self): + """Test that mixture choices change over evaluations""" + with Model() as m: + weights = [0.5, 0.5] + components = [Normal.dist(-10, 0.01), Normal.dist(10, 0.01)] + mix = Mixture.dist(weights, components) + draws = draw(mix, draws=10) + # Probability of all 10 draws coming from the same component is 2 * 0.5**10 + assert np.unique(draws > 0).size == 2 - @pytest.mark.parametrize( "comp_dists", ( [Normal.dist(size=(2,))], [Normal.dist(), Normal.dist()], [MvNormal.dist(np.ones(3), np.eye(3), size=(2,))], [ MvNormal.dist(np.ones(3), np.eye(3)), MvNormal.dist(np.ones(3), np.eye(3)), ], ), ) + def test_components_expanded_by_weights(self, comp_dists): + """Test that components are expanded when size or weights are larger than components""" + univariate = comp_dists[0].owner.op.ndim_supp == 0 + + mix = Mixture.dist( + w=Dirichlet.dist([1, 1], shape=(3, 2)), + comp_dists=comp_dists, + size=(3,), + ) + draws = mix.eval() + assert draws.shape == ((3,) if univariate else (3, 3)) + assert np.unique(draws).size == draws.size + + mix = Mixture.dist( + w=Dirichlet.dist([1, 1], shape=(4, 3, 2)), + comp_dists=comp_dists, + size=(3,), ) + draws = mix.eval() + assert draws.shape == ((4, 3) if univariate else (4, 3, 3)) + assert np.unique(draws).size == draws.size - def test_mixture_of_mvn(self): + @pytest.mark.parametrize( + "comp_dists", + ( + [Normal.dist(size=(2,))], + [Normal.dist(), Normal.dist()], + [MvNormal.dist(np.ones(3), np.eye(3), size=(2,))], + [ + MvNormal.dist(np.ones(3), np.eye(3)), + MvNormal.dist(np.ones(3), np.eye(3)), + ], + ), + ) + @pytest.mark.parametrize("expand", (False, True)) + def test_change_size(self, comp_dists, expand): + univariate = 
comp_dists[0].owner.op.ndim_supp == 0 + + mix = Mixture.dist(w=Dirichlet.dist([1, 1]), comp_dists=comp_dists) + mix = Mixture.change_size(mix, new_size=(4,), expand=expand) + draws = mix.eval() + expected_shape = (4,) if univariate else (4, 3) + assert draws.shape == expected_shape + assert np.unique(draws).size == draws.size + + mix = Mixture.dist(w=Dirichlet.dist([1, 1]), comp_dists=comp_dists, size=(3,)) + mix = Mixture.change_size(mix, new_size=(5, 4), expand=expand) + draws = mix.eval() + expected_shape = (5, 4) if univariate else (5, 4, 3) + if expand: + expected_shape = expected_shape + (3,) + assert draws.shape == expected_shape + assert np.unique(draws).size == draws.size + + def test_list_mvnormals_logp(self): mu1 = np.asarray([0.0, 1.0]) cov1 = np.diag([1.5, 2.5]) mu2 = np.asarray([1.0, 0.0]) @@ -163,22 +412,170 @@ def test_mixture_of_mvn(self): st.multivariate_normal.logpdf(obs, mu2, cov2), ) ).T - complogp = y.distribution._comp_logp(aesara.shared(obs)).eval() - assert_allclose(complogp, complogp_st) # check logp of mixture testpoint = model.compute_initial_point() mixlogp_st = logsumexp(np.log(testpoint["w"]) + complogp_st, axis=-1, keepdims=False) - assert_allclose(y.logp_elemwise(testpoint), mixlogp_st) + assert_allclose(model.compile_logp(y, sum=False)(testpoint)[0], mixlogp_st) # check logp of model priorlogp = st.dirichlet.logpdf( x=testpoint["w"], alpha=np.ones(2), ) - assert_allclose(model.logp(testpoint), mixlogp_st.sum() + priorlogp) + assert_allclose(model.compile_logp()(testpoint), mixlogp_st.sum() + priorlogp) + + def test_single_poisson_sampling(self): + pois_w = np.array([0.4, 0.6]) + pois_mu = np.array([5.0, 20.0]) + pois_x = generate_poisson_mixture_data(pois_w, pois_mu, size=1000) + + with Model() as model: + w = Dirichlet("w", floatX(np.ones_like(pois_w)), shape=pois_w.shape) + mu = Gamma("mu", 1.0, 1.0, shape=pois_w.size) + Mixture("x_obs", w, Poisson.dist(mu), observed=pois_x) + step = Metropolis() + trace = sample( + 5000, + step, + random_seed=self.random_seed, + progressbar=False, + chains=1, + return_inferencedata=False, + ) + + assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(pois_w), rtol=0.1, atol=0.1) + assert_allclose(np.sort(trace["mu"].mean(axis=0)), np.sort(pois_mu), rtol=0.1, atol=0.1) + + def test_list_poissons_sampling(self): + pois_w = np.array([0.4, 0.6]) + pois_mu = np.array([5.0, 20.0]) + pois_x = generate_poisson_mixture_data(pois_w, pois_mu, size=1000) + + with Model() as model: + w = Dirichlet("w", floatX(np.ones_like(pois_w)), shape=pois_w.shape) + mu = Gamma("mu", 1.0, 1.0, shape=pois_w.size) + Mixture("x_obs", w, [Poisson.dist(mu[0]), Poisson.dist(mu[1])], observed=pois_x) + trace = sample( + 5000, + chains=1, + step=Metropolis(), + random_seed=self.random_seed, + progressbar=False, + return_inferencedata=False, + ) + + assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(pois_w), rtol=0.1, atol=0.1) + assert_allclose(np.sort(trace["mu"].mean(axis=0)), np.sort(pois_mu), rtol=0.1, atol=0.1) + + def test_list_normals_sampling(self): + norm_w = np.array([0.75, 0.25]) + norm_mu = np.array([0.0, 5.0]) + norm_sd = np.ones_like(norm_mu) + norm_x = generate_normal_mixture_data(norm_w, norm_mu, norm_sd, size=1000) + + with Model() as model: + w = Dirichlet("w", floatX(np.ones_like(norm_w)), shape=norm_w.size) + mu = Normal("mu", 0.0, 10.0, shape=norm_w.size) + tau = Gamma("tau", 1.0, 1.0, shape=norm_w.size) + Mixture( + "x_obs", + w, + [Normal.dist(mu[0], tau=tau[0]), Normal.dist(mu[1], tau=tau[1])], + observed=norm_x, + ) + 
trace = sample( + 5000, + chains=1, + step=Metropolis(), + random_seed=self.random_seed, + progressbar=False, + return_inferencedata=False, + ) + + assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(norm_w), rtol=0.1, atol=0.1) + assert_allclose(np.sort(trace["mu"].mean(axis=0)), np.sort(norm_mu), rtol=0.1, atol=0.1) + + def test_single_poisson_predictive_sampling_shape(self): + # test the shape broadcasting in mixture random + rng = self.get_random_state() + y = np.concatenate([rng.poisson(5, size=10), rng.poisson(9, size=10)]) + with Model() as model: + comp0 = Poisson.dist(mu=np.ones(2)) + w0 = Dirichlet("w0", a=np.ones(2), shape=(2,)) + like0 = Mixture("like0", w=w0, comp_dists=comp0, observed=y) + + comp1 = Poisson.dist(mu=np.ones((20, 2)), shape=(20, 2)) + w1 = Dirichlet("w1", a=np.ones(2), shape=(2,)) + like1 = Mixture("like1", w=w1, comp_dists=comp1, observed=y) + + comp2 = Poisson.dist(mu=np.ones(2)) + w2 = Dirichlet("w2", a=np.ones(2), shape=(20, 2)) + like2 = Mixture("like2", w=w2, comp_dists=comp2, observed=y) + + comp3 = Poisson.dist(mu=np.ones(2), shape=(20, 2)) + w3 = Dirichlet("w3", a=np.ones(2), shape=(20, 2)) + like3 = Mixture("like3", w=w3, comp_dists=comp3, observed=y) + + n_samples = 30 + with model: + prior = sample_prior_predictive(samples=n_samples, return_inferencedata=False) + ppc = sample_posterior_predictive( + [self.get_initial_point(model)], samples=n_samples, return_inferencedata=False + ) + + assert prior["like0"].shape == (n_samples, 20) + assert prior["like1"].shape == (n_samples, 20) + assert prior["like2"].shape == (n_samples, 20) + assert prior["like3"].shape == (n_samples, 20) + + assert ppc["like0"].shape == (n_samples, 20) + assert ppc["like1"].shape == (n_samples, 20) + assert ppc["like2"].shape == (n_samples, 20) + assert ppc["like3"].shape == (n_samples, 20) + + def test_list_mvnormals_predictive_sampling_shape(self): + N = 100 # number of data points + K = 3 # number of mixture components + D = 3 # dimensionality of the data + X = MvNormal.dist(np.zeros(D), np.eye(D), size=N).eval() + + with Model() as model: + pi = Dirichlet("pi", np.ones(K), shape=(K,)) + + comp_dist = [] + mu = [] + packed_chol = [] + chol = [] + for i in range(K): + mu.append(Normal(f"mu{i}", 0, 10, shape=D)) + packed_chol.append( + LKJCholeskyCov( + f"chol_cov_{i}", + eta=2, + n=D, + sd_dist=HalfNormal.dist(2.5, size=D), + compute_corr=False, + ) + ) + chol.append(expand_packed_triangular(D, packed_chol[i], lower=True)) + comp_dist.append(MvNormal.dist(mu=mu[i], chol=chol[i], shape=D)) + + Mixture("x_obs", pi, comp_dist, observed=X) - def test_mixture_of_mixture(self): + n_samples = 20 + with model: + prior = sample_prior_predictive(samples=n_samples, return_inferencedata=False) + ppc = sample_posterior_predictive( + [self.get_initial_point(model)], samples=n_samples, return_inferencedata=False + ) assert ppc["x_obs"].shape == (n_samples,) + X.shape assert prior["x_obs"].shape == (n_samples,) + X.shape assert prior["mu0"].shape == (n_samples, D) assert prior["chol_cov_0"].shape == (n_samples, D * (D + 1) // 2) + @pytest.mark.xfail(reason="Mixture from single component not refactored yet") + def test_nested_mixture(self): if aesara.config.floatX == "float32": rtol = 1e-4 else: @@ -252,92 +649,6 @@ def mixmixlogp(value, point): assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), rtol=rtol) assert_allclose(priorlogp + mixmixlogpg.sum(), model.logp(test_point), rtol=rtol) - def test_sample_prior_and_posterior(self): - def build_toy_dataset(N, K): - pi = 
np.array([0.2, 0.5, 0.3]) - mus = [[1, 1, 1], [-1, -1, -1], [2, -2, 0]] - stds = [[0.1, 0.1, 0.1], [0.1, 0.2, 0.2], [0.2, 0.3, 0.3]] - x = np.zeros((N, 3), dtype=np.float32) - y = np.zeros((N,), dtype=np.int) - for n in range(N): - k = np.argmax(np.random.multinomial(1, pi)) - x[n, :] = np.random.multivariate_normal(mus[k], np.diag(stds[k])) - y[n] = k - return x, y - - N = 100 # number of data points - K = 3 # number of mixture components - D = 3 # dimensionality of the data - - X, y = build_toy_dataset(N, K) - - with Model() as model: - pi = Dirichlet("pi", np.ones(K), shape=(K,)) - - comp_dist = [] - mu = [] - packed_chol = [] - chol = [] - for i in range(K): - mu.append(Normal("mu%i" % i, 0, 10, shape=D)) - packed_chol.append( - LKJCholeskyCov( - "chol_cov_%i" % i, eta=2, n=D, sd_dist=HalfNormal.dist(2.5, size=D) - ) - ) - chol.append(expand_packed_triangular(D, packed_chol[i], lower=True)) - comp_dist.append(MvNormal.dist(mu=mu[i], chol=chol[i], shape=D)) - - Mixture("x_obs", pi, comp_dist, observed=X) - with model: - idata = sample(30, tune=10, chains=1) - - n_samples = 20 - with model: - ppc = sample_posterior_predictive(idata, n_samples) - prior = sample_prior_predictive(samples=n_samples) - assert ppc["x_obs"].shape == (n_samples,) + X.shape - assert prior["x_obs"].shape == (n_samples,) + X.shape - assert prior["mu0"].shape == (n_samples, D) - assert prior["chol_cov_0"].shape == (n_samples, D * (D + 1) // 2) - - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") - def test_mixture_random_shape(self): - # test the shape broadcasting in mixture random - y = np.concatenate([np.random.poisson(5, size=10), np.random.poisson(9, size=10)]) - with pm.Model() as m: - comp0 = pm.Poisson.dist(mu=np.ones(2)) - w0 = pm.Dirichlet("w0", a=np.ones(2), shape=(2,)) - like0 = pm.Mixture("like0", w=w0, comp_dists=comp0, observed=y) - - comp1 = pm.Poisson.dist(mu=np.ones((20, 2)), shape=(20, 2)) - w1 = pm.Dirichlet("w1", a=np.ones(2), shape=(2,)) - like1 = pm.Mixture("like1", w=w1, comp_dists=comp1, observed=y) - - comp2 = pm.Poisson.dist(mu=np.ones(2)) - w2 = pm.Dirichlet("w2", a=np.ones(2), shape=(20, 2)) - like2 = pm.Mixture("like2", w=w2, comp_dists=comp2, observed=y) - - comp3 = pm.Poisson.dist(mu=np.ones(2), shape=(20, 2)) - w3 = pm.Dirichlet("w3", a=np.ones(2), shape=(20, 2)) - like3 = pm.Mixture("like3", w=w3, comp_dists=comp3, observed=y) - - # XXX: This needs to be refactored - rand0, rand1, rand2, rand3 = [None] * 4 # draw_values( - # [like0, like1, like2, like3], point=m.initial_point, size=100 - # ) - assert rand0.shape == (100, 20) - assert rand1.shape == (100, 20) - assert rand2.shape == (100, 20) - assert rand3.shape == (100, 20) - - with m: - ppc = pm.sample_posterior_predictive([m.compute_initial_point()], samples=200) - assert ppc["like0"].shape == (200, 20) - assert ppc["like1"].shape == (200, 20) - assert ppc["like2"].shape == (200, 20) - assert ppc["like3"].shape == (200, 20) - class TestNormalMixture(SeededTest): @classmethod @@ -465,6 +776,7 @@ def ref_rand(size, w, mu, sigma): ) +@pytest.mark.xfail(reason="NormalMixture not refactored yet") class TestMixtureVsLatent(SeededTest): def setup_method(self, *args, **kwargs): super().setup_method(*args, **kwargs) @@ -573,6 +885,7 @@ def logp_matches(self, mixture, latent_mix, z, npop, model): assert_allclose(mix_logp, latent_mix_logp, rtol=rtol) +@pytest.mark.xfail(reason="MixtureSameFamily not refactored yet") class TestMixtureSameFamily(SeededTest): @classmethod def setup_class(cls): From 
3531ab7a63eb31486623dfb0c0c391a649ff69c5 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 2 Mar 2022 13:58:27 +0100 Subject: [PATCH 07/13] Add warning when using iterable with single Mixture component --- pymc/distributions/mixture.py | 8 ++++++++ pymc/tests/test_mixture.py | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py index bd24748d73..60367830e7 100644 --- a/pymc/distributions/mixture.py +++ b/pymc/distributions/mixture.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import warnings + import aesara import aesara.tensor as at import numpy as np @@ -129,6 +131,12 @@ def dist(cls, w, comp_dists, **kwargs): if not isinstance(comp_dists, (tuple, list)): # comp_dists is a single component comp_dists = [comp_dists] + elif len(comp_dists) == 1: + warnings.warn( + "Single component will be treated as a mixture across the last size dimension.\n" + "To disable this warning do not wrap the single component inside a list or tuple", + UserWarning, + ) # Check that components are not associated with a registered variable in the model components_ndim = set() diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index ecbaf5b52d..a22db4abe4 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -649,6 +649,15 @@ def mixmixlogp(value, point): assert_allclose(mixmixlogpg, mix.logp_elemwise(test_point), rtol=rtol) assert_allclose(priorlogp + mixmixlogpg.sum(), model.logp(test_point), rtol=rtol) + def test_iterable_single_component_warning(self): + with pytest.warns(None) as record: + Mixture.dist(w=[0.5, 0.5], comp_dists=Normal.dist(size=2)) + Mixture.dist(w=[0.5, 0.5], comp_dists=[Normal.dist(size=2), Normal.dist(size=2)]) + assert not record + + with pytest.warns(UserWarning, match="Single component will be treated as a mixture"): + Mixture.dist(w=[0.5, 0.5], comp_dists=[Normal.dist(size=2)]) + class TestNormalMixture(SeededTest): @classmethod From 0d99f5363110c6e63cc3133fee6abc2e09e9f01e Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 2 Mar 2022 13:49:33 +0100 Subject: [PATCH 08/13] Update Mixture docstrings * Emphasize equivalency between iterable of components and single batched component * Add example with mixture of two distinct distributions * Add example with multivariate components --- pymc/distributions/mixture.py | 94 +++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 31 deletions(-) diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py index 60367830e7..9abdc9f030 100644 --- a/pymc/distributions/mixture.py +++ b/pymc/distributions/mixture.py @@ -67,63 +67,95 @@ class Mixture(SymbolicDistribution): Parameters ---------- - w: array of floats + w : tensor_like of float w >= 0 and w <= 1 the mixture weights - comp_dists: multidimensional PyMC distribution (e.g. `pm.Poisson.dist(...)`) - or iterable of PyMC distributions the component distributions + comp_dists : iterable of PyMC distributions or single batched distribution + Distributions should be created via the `.dist()` API. If single distribution is + passed, the last size dimension (not shape) determines the number of mixture + components (e.g. `pm.Poisson.dist(..., size=components)`) :math:`f_1, \ldots, f_n` Examples -------- .. 
code-block:: python - # 2-Mixture Poisson distribution + # Mixture of 2 Poisson variables with pm.Model() as model: - lam = pm.Exponential('lam', lam=1, shape=(2,)) # `shape=(2,)` indicates two mixture components. + w = pm.Dirichlet('w', a=np.array([1, 1])) # 2 mixture weights - # As we just need the logp, rather than add a RV to the model, we need to call .dist() - components = pm.Poisson.dist(mu=lam, shape=(2,)) + lam1 = pm.Exponential('lam1', lam=1) + lam2 = pm.Exponential('lam2', lam=1) - w = pm.Dirichlet('w', a=np.array([1, 1])) # two mixture component weights. + # As we just need the logp, rather than add a RV to the model, we need to call `.dist()` + # These two forms are equivalent, but the second benefits from vectorization + components = [ + pm.Poisson.dist(mu=lam1), + pm.Poisson.dist(mu=lam2), + ] + # `shape=(2,)` indicates 2 mixture components + components = pm.Poisson.dist(mu=pm.math.stack([lam1, lam2]), shape=(2,)) like = pm.Mixture('like', w=w, comp_dists=components, observed=data) - # 2-Mixture Poisson using iterable of distributions. - with pm.Model() as model: - lam1 = pm.Exponential('lam1', lam=1) - lam2 = pm.Exponential('lam2', lam=1) - pois1 = pm.Poisson.dist(mu=lam1) - pois2 = pm.Poisson.dist(mu=lam2) + .. code-block:: python - w = pm.Dirichlet('w', a=np.array([1, 1])) + # Mixture of Normal and StudentT variables + with pm.Model() as model: + w = pm.Dirichlet('w', a=np.array([1, 1])) # 2 mixture weights - like = pm.Mixture('like', w=w, comp_dists = [pois1, pois2], observed=data) + mu = pm.Normal("mu", 0, 1) - # npop-Mixture of multidimensional Gaussian - npop = 5 - nd = (3, 4) - with pm.Model() as model: - mu = pm.Normal('mu', mu=np.arange(npop), sigma=1, shape=npop) # Each component has an independent mean + components = [ + pm.Normal.dist(mu=mu, sigma=1), + pm.StudentT.dist(nu=4, mu=mu, sigma=1), + ] - w = pm.Dirichlet('w', a=np.ones(npop)) + like = pm.Mixture('like', w=w, comp_dists=components, observed=data) - components = pm.Normal.dist(mu=mu, sigma=1, shape=nd + (npop,)) # nd + (npop,) shaped multinomial - like = pm.Mixture('like', w=w, comp_dists = components, observed=data, shape=nd) # The resulting mixture is nd-shaped + .. code-block:: python - # Multidimensional Mixture as stacked independent mixtures + # Mixture of (5 x 3) Normal variables with pm.Model() as model: - mu = pm.Normal('mu', mu=np.arange(5), sigma=1, shape=5) # Each component has an independent mean + # w is a stack of 5 independent size 3 weight vectors + # If shape was `(3,)`, the weights would be shared across the 5 replication dimensions + w = pm.Dirichlet('w', a=np.ones(3), shape=(5, 3)) + + # Each of the 3 mixture components has an independent mean + mu = pm.Normal('mu', mu=np.arange(3), sigma=1, shape=3) + + # These two forms are equivalent, but the second benefits from vectorization + components = [ + pm.Normal.dist(mu=mu[0], sigma=1, shape=(5,)), + pm.Normal.dist(mu=mu[1], sigma=1, shape=(5,)), + pm.Normal.dist(mu=mu[2], sigma=1, shape=(5,)), + ] + components = pm.Normal.dist(mu=mu, sigma=1, shape=(5, 3)) + + # The mixture is an array of 5 elements + # Each element can be thought of as an independent scalar mixture of 3 + # components with different means + like = pm.Mixture('like', w=w, comp_dists=components, observed=data) + - w = pm.Dirichlet('w', a=np.ones(3, 5)) # w is a stack of 3 independent 5 component weight arrays + .. 
code-block:: python + + # Mixture of 2 Dirichlet variables + with pm.Model() as model: + w = pm.Dirichlet('w', a=np.ones(2)) # 2 mixture weights - components = pm.Normal.dist(mu=mu, sigma=1, shape=(3, 5)) + # These two forms are equivalent, but the second benefits from vectorization + components = [ + pm.Dirichlet.dist(a=[1, 10, 100], shape=(3,)), + pm.Dirichlet.dist(a=[100, 10, 1], shape=(3,)), + ] + components = pm.Dirichlet.dist(a=[[1, 10, 100], [100, 10, 1]], shape=(2, 3)) - # The mixture is an array of 3 elements. - # Each can be thought of as an independent scalar mixture of 5 components - like = pm.Mixture('like', w=w, comp_dists = components, observed=data, shape=3) + # The mixture is an array of 3 elements + # Each element comes from only one of the two core Dirichlet components + like = pm.Mixture('like', w=w, comp_dists=components, observed=data) """ @classmethod From 524025b1f901e90caf66cb3cce2d92773c4d251c Mon Sep 17 00:00:00 2001 From: Ricardo Date: Mon, 7 Mar 2022 17:06:30 +0100 Subject: [PATCH 09/13] Refactor NormalMixture --- pymc/distributions/mixture.py | 38 +++++----- pymc/tests/test_distributions_moments.py | 2 - pymc/tests/test_distributions_random.py | 3 +- pymc/tests/test_mixture.py | 90 ++++++++++-------------- 4 files changed, 58 insertions(+), 75 deletions(-) diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py index 9abdc9f030..42b7467040 100644 --- a/pymc/distributions/mixture.py +++ b/pymc/distributions/mixture.py @@ -24,7 +24,7 @@ from aesara.tensor.random.op import RandomVariable from pymc.aesaraf import change_rv_size, take_along_axis -from pymc.distributions.continuous import Normal +from pymc.distributions.continuous import Normal, get_tau_sigma from pymc.distributions.dist_math import check_parameters from pymc.distributions.distribution import Discrete, Distribution, SymbolicDistribution from pymc.distributions.logprob import logp @@ -399,7 +399,7 @@ def marginal_mixture_logprob(op, values, rng, weights, *components, **kwargs): return mix_logp -class NormalMixture(Mixture): +class NormalMixture: R""" Normal mixture log-likelihood @@ -415,18 +415,18 @@ class NormalMixture(Mixture): Parameters ---------- - w: array of floats + w : tensor_like of float w >= 0 and w <= 1 the mixture weights - mu: array of floats + mu : tensor_like of float the component means - sigma: array of floats + sigma : tensor_like of float the component standard deviations - tau: array of floats + tau : tensor_like of float the component precisions - comp_shape: shape of the Normal component + comp_shape : shape of the Normal component notice that it should be different than the shape - of the mixture distribution, with one axis being + of the mixture distribution, with the last axis representing the number of components. 
Notes @@ -442,30 +442,32 @@ class NormalMixture(Mixture): with pm.Model() as gauss_mix: μ = pm.Normal( "μ", - data.mean(), - 10, + mu=data.mean(), + sigma=10, shape=n_components, transform=pm.transforms.ordered, initval=[1, 2, 3], ) - σ = pm.HalfNormal("σ", 10, shape=n_components) + σ = pm.HalfNormal("σ", sigma=10, shape=n_components) weights = pm.Dirichlet("w", np.ones(n_components)) - pm.NormalMixture("y", w=weights, mu=μ, sigma=σ, observed=data) + y = pm.NormalMixture("y", w=weights, mu=μ, sigma=σ, observed=data) """ - def __init__(self, w, mu, sigma=None, tau=None, sd=None, comp_shape=(), *args, **kwargs): + def __new__(cls, name, w, mu, sigma=None, tau=None, sd=None, comp_shape=(), **kwargs): if sd is not None: sigma = sd _, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.mu = mu = at.as_tensor_variable(mu) - self.sigma = self.sd = sigma = at.as_tensor_variable(sigma) + return Mixture(name, w, Normal.dist(mu, sigma=sigma, size=comp_shape), **kwargs) - super().__init__(w, Normal.dist(mu, sigma=sigma, shape=comp_shape), *args, **kwargs) + @classmethod + def dist(cls, w, mu, sigma=None, tau=None, sd=None, comp_shape=(), **kwargs): + if sd is not None: + sigma = sd + _, sigma = get_tau_sigma(tau=tau, sigma=sigma) - def _distr_parameters_for_repr(self): - return ["w", "mu", "sigma"] + return Mixture.dist(w, Normal.dist(mu, sigma=sigma, size=comp_shape), **kwargs) class MixtureSameFamily(Distribution): diff --git a/pymc/tests/test_distributions_moments.py b/pymc/tests/test_distributions_moments.py index f9def30e9c..5018ab4ce1 100644 --- a/pymc/tests/test_distributions_moments.py +++ b/pymc/tests/test_distributions_moments.py @@ -100,9 +100,7 @@ def test_all_distributions_have_moments(): # Distributions that have not been refactored for V4 yet not_implemented = { - dist_module.mixture.Mixture, dist_module.mixture.MixtureSameFamily, - dist_module.mixture.NormalMixture, dist_module.timeseries.AR, dist_module.timeseries.AR1, dist_module.timeseries.GARCH11, diff --git a/pymc/tests/test_distributions_random.py b/pymc/tests/test_distributions_random.py index 3dbba9c5a4..f01be90541 100644 --- a/pymc/tests/test_distributions_random.py +++ b/pymc/tests/test_distributions_random.py @@ -73,6 +73,7 @@ def pymc_random( fails=10, extra_args=None, model_args=None, + change_rv_size_fn=change_rv_size, ): if valuedomain is None: valuedomain = Domain([0], edges=(None, None)) @@ -81,7 +82,7 @@ def pymc_random( model_args = {} model, param_vars = build_model(dist, valuedomain, paramdomains, extra_args) - model_dist = change_rv_size(model.named_vars["value"], size, expand=True) + model_dist = change_rv_size_fn(model.named_vars["value"], size, expand=True) pymc_rand = aesara.function([], model_dist) domains = paramdomains.copy() diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index a22db4abe4..b159250bf6 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -660,26 +660,32 @@ def test_iterable_single_component_warning(self): class TestNormalMixture(SeededTest): - @classmethod - def setup_class(cls): - TestMixture.setup_class() + def test_normal_mixture_sampling(self): + norm_w = np.array([0.75, 0.25]) + norm_mu = np.array([0.0, 5.0]) + norm_sd = np.ones_like(norm_mu) + norm_x = generate_normal_mixture_data(norm_w, norm_mu, norm_sd, size=1000) - def test_normal_mixture(self): with Model() as model: - w = Dirichlet("w", floatX(np.ones_like(self.norm_w)), shape=self.norm_w.size) - mu = Normal("mu", 0.0, 10.0, shape=self.norm_w.size) - tau = Gamma("tau", 1.0, 1.0, 
shape=self.norm_w.size) - NormalMixture("x_obs", w, mu, tau=tau, observed=self.norm_x) + w = Dirichlet("w", floatX(np.ones_like(norm_w)), shape=norm_w.size) + mu = Normal("mu", 0.0, 10.0, shape=norm_w.size) + tau = Gamma("tau", 1.0, 1.0, shape=norm_w.size) + NormalMixture("x_obs", w, mu, tau=tau, observed=norm_x) step = Metropolis() - trace = sample(5000, step, random_seed=self.random_seed, progressbar=False, chains=1) + trace = sample( + 5000, + step, + random_seed=self.random_seed, + progressbar=False, + chains=1, + return_inferencedata=False, + ) - assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(self.norm_w), rtol=0.1, atol=0.1) - assert_allclose( - np.sort(trace["mu"].mean(axis=0)), np.sort(self.norm_mu), rtol=0.1, atol=0.1 - ) + assert_allclose(np.sort(trace["w"].mean(axis=0)), np.sort(norm_w), rtol=0.1, atol=0.1) + assert_allclose(np.sort(trace["mu"].mean(axis=0)), np.sort(norm_mu), rtol=0.1, atol=0.1) @pytest.mark.parametrize( - "nd,ncomp", [(tuple(), 5), (1, 5), (3, 5), ((3, 3), 5), (3, 3), ((3, 3), 3)], ids=str + "nd, ncomp", [(tuple(), 5), (1, 5), (3, 5), ((3, 3), 5), (3, 3), ((3, 3), 3)], ids=str ) def test_normal_mixture_nd(self, nd, ncomp): nd = to_tuple(nd) @@ -697,7 +703,7 @@ def test_normal_mixture_nd(self, nd, ncomp): ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) mixture0 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape) obs0 = NormalMixture( - "obs", w=ws, mu=mus, tau=taus, shape=nd, comp_shape=comp_shape, observed=observed + "obs", w=ws, mu=mus, tau=taus, comp_shape=comp_shape, observed=observed ) with Model() as model1: @@ -708,53 +714,27 @@ def test_normal_mixture_nd(self, nd, ncomp): Normal.dist(mu=mus[..., i], tau=taus[..., i], shape=nd) for i in range(ncomp) ] mixture1 = Mixture("m", w=ws, comp_dists=comp_dist, shape=nd) - obs1 = Mixture("obs", w=ws, comp_dists=comp_dist, shape=nd, observed=observed) + obs1 = Mixture("obs", w=ws, comp_dists=comp_dist, observed=observed) with Model() as model2: - # Expected to fail if comp_shape is not provided, - # nd is multidim and it does not broadcast with ncomp. If by chance - # it does broadcast, an error is raised if the mixture is given - # observed data. - # Furthermore, the Mixture will also raise errors when the observed - # data is multidimensional but it does not broadcast well with - # comp_dists. + # Test that results are correct without comp_shape being passed to the Mixture. 
+ # This used to fail in V3 mus = Normal("mus", shape=comp_shape) taus = Gamma("taus", alpha=1, beta=1, shape=comp_shape) ws = Dirichlet("ws", np.ones(ncomp), shape=(ncomp,)) - if len(nd) > 1: - if nd[-1] != ncomp: - with pytest.raises(ValueError): - NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) - mixture2 = None - else: - mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) - else: - mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) - observed_fails = False - if len(nd) >= 1 and nd != (1,): - try: - np.broadcast(np.empty(comp_shape), observed) - except Exception: - observed_fails = True - if observed_fails: - with pytest.raises(ValueError): - NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed) - obs2 = None - else: - obs2 = NormalMixture("obs", w=ws, mu=mus, tau=taus, shape=nd, observed=observed) + mixture2 = NormalMixture("m", w=ws, mu=mus, tau=taus, shape=nd) + obs2 = NormalMixture("obs", w=ws, mu=mus, tau=taus, observed=observed) testpoint = model0.compute_initial_point() testpoint["mus"] = test_mus - testpoint["taus"] = test_taus - assert_allclose(model0.logp(testpoint), model1.logp(testpoint)) - assert_allclose(mixture0.logp(testpoint), mixture1.logp(testpoint)) - assert_allclose(obs0.logp(testpoint), obs1.logp(testpoint)) - if mixture2 is not None and obs2 is not None: - assert_allclose(model0.logp(testpoint), model2.logp(testpoint)) - if mixture2 is not None: - assert_allclose(mixture0.logp(testpoint), mixture2.logp(testpoint)) - if obs2 is not None: - assert_allclose(obs0.logp(testpoint), obs2.logp(testpoint)) + testpoint["taus_log__"] = np.log(test_taus) + for logp0, logp1, logp2 in zip( + model0.compile_logp(vars=[mixture0, obs0], sum=False)(testpoint), + model1.compile_logp(vars=[mixture1, obs1], sum=False)(testpoint), + model2.compile_logp(vars=[mixture2, obs2], sum=False)(testpoint), + ): + assert_allclose(logp0, logp1) + assert_allclose(logp0, logp2) def test_random(self): def ref_rand(size, w, mu, sigma): @@ -771,6 +751,7 @@ def ref_rand(size, w, mu, sigma): extra_args={"comp_shape": 2}, size=1000, ref_rand=ref_rand, + change_rv_size_fn=Mixture.change_size, ) pymc_random( NormalMixture, @@ -782,6 +763,7 @@ def ref_rand(size, w, mu, sigma): extra_args={"comp_shape": 3}, size=1000, ref_rand=ref_rand, + change_rv_size_fn=Mixture.change_size, ) From 7406beb6de68df18050f060d6d56cef05e64b912 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Mon, 28 Feb 2022 23:24:17 +0100 Subject: [PATCH 10/13] Refactor TestMixtureVsLatent The two tests relied on implicit behavior of V3, where the dimensionality of the weights implied the support dimension of mixture distribution. This, however, led to inconsistent behavior between the random method and the logp, as the latter did not enforce this assumption, and did not distinguish if values were mixed across the implied support dimension. In this refactoring, the support dimensionality of the component variables determines the dimensionality of the mixture distribution, regardless of the weights. This leads to consistent behavior between the random and logp methods as asserted by the new checks. Future work will explore allowing the user to specify an artificial support dimensionality that is higher than the one implied by the component distributions, but this is for now not possible. 
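To make the new contract concrete, here is a minimal sketch (illustration only, not part of the diff; it uses just the `Mixture.dist` and component `.dist()` APIs touched by this series):

    import numpy as np
    import pymc as pm

    w = np.ones(4) / 4
    mus = np.full((3, 4), np.arange(4))

    # Scalar components (ndim_supp == 0): the mixture is a batch of 3
    # independent scalar mixtures. Elements of a single draw may come from
    # different components and the logp is evaluated elementwise.
    scalar_mix = pm.Mixture.dist(w=w, comp_dists=pm.Normal.dist(mu=mus, sigma=0.01))

    # Vector components (ndim_supp == 1): every draw is a whole length-3
    # vector taken from one component, and the logp of a value is reduced
    # over the support dimension, whatever the shape of w.
    vector_mix = pm.Mixture.dist(
        w=w,
        comp_dists=[pm.MvNormal.dist(mus[:, i], np.eye(3) * 1e-4) for i in range(4)],
    )

In both cases the weights only select components; they no longer imply the support dimensionality of the mixture.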
--- pymc/tests/test_mixture.py | 144 ++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index b159250bf6..2fb3dc1f84 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -767,60 +767,19 @@ def ref_rand(size, w, mu, sigma): ) -@pytest.mark.xfail(reason="NormalMixture not refactored yet") class TestMixtureVsLatent(SeededTest): - def setup_method(self, *args, **kwargs): - super().setup_method(*args, **kwargs) - self.nd = 3 - self.npop = 3 - self.mus = at.as_tensor_variable( - np.tile( - np.reshape( - np.arange(self.npop), - ( - 1, - -1, - ), - ), - ( - self.nd, - 1, - ), - ) - ) + """This class contains tests that compare a marginal Mixture with a latent indexed Mixture""" - def test_1d_w(self): - nd = self.nd - npop = self.npop - mus = self.mus - size = 100 - with Model() as model: - m = NormalMixture( - "m", w=np.ones(npop) / npop, mu=mus, sigma=1e-5, comp_shape=(nd, npop), shape=nd - ) - z = Categorical("z", p=np.ones(npop) / npop) - latent_m = Normal("latent_m", mu=mus[..., z], sigma=1e-5, shape=nd) + def test_scalar_components(self): + nd = 3 + npop = 4 + # [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]] + mus = at.constant(np.full((nd, npop), np.arange(npop))) - m_val = m.random(size=size) - latent_m_val = latent_m.random(size=size) - assert m_val.shape == latent_m_val.shape - # Test that each element in axis = -1 comes from the same mixture - # component - assert all(np.all(np.diff(m_val) < 1e-3, axis=-1)) - assert all(np.all(np.diff(latent_m_val) < 1e-3, axis=-1)) - - self.samples_from_same_distribution(m_val, latent_m_val) - self.logp_matches(m, latent_m, z, npop, model=model) - - def test_2d_w(self): - nd = self.nd - npop = self.npop - mus = self.mus - size = 100 - with Model() as model: + with Model(rng_seeder=self.get_random_state()) as model: m = NormalMixture( "m", - w=np.ones((nd, npop)) / npop, + w=np.ones(npop) / npop, mu=mus, sigma=1e-5, comp_shape=(nd, npop), @@ -830,15 +789,55 @@ def test_2d_w(self): mu = at.as_tensor_variable([mus[i, z[i]] for i in range(nd)]) latent_m = Normal("latent_m", mu=mu, sigma=1e-5, shape=nd) - m_val = m.random(size=size) - latent_m_val = latent_m.random(size=size) + size = 100 + m_val = draw(m, draws=size) + latent_m_val = draw(latent_m, draws=size) + assert m_val.shape == latent_m_val.shape # Test that each element in axis = -1 can come from independent # components assert not all(np.all(np.diff(m_val) < 1e-3, axis=-1)) assert not all(np.all(np.diff(latent_m_val) < 1e-3, axis=-1)) + self.samples_from_same_distribution(m_val, latent_m_val) + + # Check that logp is the same whether elements of the last axis are mixed or not + logp_fn = model.compile_logp(vars=[m]) + assert np.isclose(logp_fn({"m": [0, 0, 0]}), logp_fn({"m": [0, 1, 2]})) + self.logp_matches(m, latent_m, z, npop, model=model) + + def test_vector_components(self): + nd = 3 + npop = 4 + # [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]] + mus = at.constant(np.full((nd, npop), np.arange(npop))) + + with Model(rng_seeder=self.get_random_state()) as model: + m = Mixture( + "m", + w=np.ones(npop) / npop, + # MvNormal distribution with squared sigma diagonal covariance should + # be equal to vector of Normals from latent_m + comp_dists=[MvNormal.dist(mus[:, i], np.eye(nd) * 1e-5**2) for i in range(npop)], + ) + z = Categorical("z", p=np.ones(npop) / npop) + latent_m = Normal("latent_m", mu=mus[..., z], sigma=1e-5, shape=nd) + size = 100 + m_val = draw(m, draws=size) + 
latent_m_val = draw(latent_m, draws=size) + assert m_val.shape == latent_m_val.shape + # Test that each element in axis = -1 comes from the same mixture + # component + assert np.all(np.diff(m_val) < 1e-3) + assert np.all(np.diff(latent_m_val) < 1e-3) + # TODO: The following statistical test appears to be more flaky than expected + # even though the distributions should be the same. Seeding should make it + # stable but might be worth investigating further self.samples_from_same_distribution(m_val, latent_m_val) + + # Check that mixing of values in the last axis leads to smaller logp + logp_fn = model.compile_logp(vars=[m]) + assert logp_fn({"m": [0, 0, 0]}) > logp_fn({"m": [0, 1, 0]}) > logp_fn({"m": [0, 1, 2]}) self.logp_matches(m, latent_m, z, npop, model=model) def samples_from_same_distribution(self, *args): @@ -848,31 +847,42 @@ _, p_correlation = st.ks_2samp( *(np.array([np.corrcoef(ss) for ss in s]).flatten() for s in args) ) + # This has a failure rate of ~10% (1 - 0.95**2), even if the distributions are the same assert p_marginal >= 0.05 and p_correlation >= 0.05 def logp_matches(self, mixture, latent_mix, z, npop, model): + def loose_logp(model, vars): + """Return logp function that accepts dictionary with unused variables as input""" + return model.compile_fn( + model.logpt(vars=vars, sum=False), + inputs=model.value_vars, + on_unused_input="ignore", + ) + if aesara.config.floatX == "float32": rtol = 1e-4 else: rtol = 1e-7 test_point = model.compute_initial_point() - test_point["latent_m"] = test_point["m"] - mix_logp = mixture.logp(test_point) - logps = [] + test_point["m"] = test_point["latent_m"] + + mix_logp = loose_logp(model, mixture)(test_point)[0] + + z_shape = z.shape.eval() + latent_mix_components_logps = [] for component in range(npop): - test_point["z"] = component * np.ones(z.distribution.shape) - # Count the number of axes that should be broadcasted from z to - # modify the logp - sh1 = test_point["z"].shape - sh2 = test_point["latent_m"].shape - if len(sh1) > len(sh2): - sh2 = (1,) * (len(sh1) - len(sh2)) + sh2 - elif len(sh2) > len(sh1): - sh1 = (1,) * (len(sh2) - len(sh1)) + sh1 - reps = np.prod([s2 if s1 != s2 else 1 for s1, s2 in zip(sh1, sh2)]) - z_logp = z.logp(test_point) * reps - logps.append(z_logp + latent_mix.logp(test_point)) - latent_mix_logp = logsumexp(np.array(logps), axis=0) + test_point["z"] = np.full(z_shape, component) + z_logp = loose_logp(model, z)(test_point)[0] + latent_mix_component_logp = loose_logp(model, latent_mix)(test_point)[0] + # If the mixture support is a vector, the logp should be summed within + # components, as its items are not independent + if mix_logp.ndim == 0: + latent_mix_component_logp = latent_mix_component_logp.sum() + latent_mix_components_logps.append(z_logp + latent_mix_component_logp) + latent_mix_logp = logsumexp(np.array(latent_mix_components_logps), axis=0) + if mix_logp.ndim == 0: + latent_mix_logp = latent_mix_logp.sum() + assert_allclose(mix_logp, latent_mix_logp, rtol=rtol) From 34f467959e3a865e99d910781a018e462d9f3a04 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 1 Mar 2022 13:25:57 +0100 Subject: [PATCH 11/13] Remove MixtureSameFamily Behavior is now implemented in Mixture --- docs/source/api/distributions/mixture.rst | 1 - pymc/distributions/__init__.py | 3 +- pymc/distributions/mixture.py | 237 +--------------------- pymc/tests/test_distributions_moments.py | 1 - pymc/tests/test_mixture.py | 42 ++--- 5 files changed, 22 insertions(+), 262 deletions(-) 
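Migration sketch (reviewer illustration, not applied by this patch; it mirrors the test changes below): a batched component that previously required `MixtureSameFamily` can now be passed directly to `Mixture`. The mixture components are always taken from the last batch axis of the component distribution, so the `mixture_axis` argument disappears:

    import numpy as np
    import pymc as pm

    mixture_comps = 5
    mu = np.random.randn(mixture_comps, 3)  # 5 components of a 3-variate normal
    w = np.ones(mixture_comps) / mixture_comps

    with pm.Model():
        comp_dists = pm.MvNormal.dist(mu=mu, cov=np.eye(3), shape=(mixture_comps, 3))
        # V3: MixtureSameFamily("mixture", w=w, comp_dists=comp_dists,
        #                       mixture_axis=0, shape=(3,))
        mixture = pm.Mixture("mixture", w=w, comp_dists=comp_dists, shape=(3,))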
diff --git a/docs/source/api/distributions/mixture.rst b/docs/source/api/distributions/mixture.rst index 50a34ff8b2..747e9c2621 100644 --- a/docs/source/api/distributions/mixture.rst +++ b/docs/source/api/distributions/mixture.rst @@ -8,4 +8,3 @@ Mixture Mixture NormalMixture - MixtureSameFamily diff --git a/pymc/distributions/__init__.py b/pymc/distributions/__init__.py index 8134addf85..ca7135e896 100644 --- a/pymc/distributions/__init__.py +++ b/pymc/distributions/__init__.py @@ -83,7 +83,7 @@ NoDistribution, SymbolicDistribution, ) -from pymc.distributions.mixture import Mixture, MixtureSameFamily, NormalMixture +from pymc.distributions.mixture import Mixture, NormalMixture from pymc.distributions.multivariate import ( CAR, Dirichlet, @@ -180,7 +180,6 @@ "SkewNormal", "Mixture", "NormalMixture", - "MixtureSameFamily", "Triangular", "DiscreteWeibull", "Gumbel", diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py index 42b7467040..83bcfc9512 100644 --- a/pymc/distributions/mixture.py +++ b/pymc/distributions/mixture.py @@ -23,16 +23,15 @@ from aesara.tensor import TensorVariable from aesara.tensor.random.op import RandomVariable -from pymc.aesaraf import change_rv_size, take_along_axis +from pymc.aesaraf import change_rv_size from pymc.distributions.continuous import Normal, get_tau_sigma from pymc.distributions.dist_math import check_parameters from pymc.distributions.distribution import Discrete, Distribution, SymbolicDistribution from pymc.distributions.logprob import logp from pymc.distributions.shape_utils import to_tuple -from pymc.math import logsumexp from pymc.util import check_dist_not_registered -__all__ = ["Mixture", "NormalMixture", "MixtureSameFamily"] +__all__ = ["Mixture", "NormalMixture"] def all_discrete(comp_dists): @@ -468,235 +467,3 @@ def dist(cls, w, mu, sigma=None, tau=None, sd=None, comp_shape=(), **kwargs): _, sigma = get_tau_sigma(tau=tau, sigma=sigma) return Mixture.dist(w, Normal.dist(mu, sigma=sigma, size=comp_shape), **kwargs) - - -class MixtureSameFamily(Distribution): - R""" - Mixture Same Family log-likelihood - This distribution handles mixtures of multivariate distributions in a vectorized - manner. It is used over Mixture distribution when the mixture components are not - present on the last axis of components' distribution. - - .. math::f(x \mid w, \theta) = \sum_{i = 1}^n w_i f_i(x \mid \theta_i)\textrm{ Along mixture\_axis} - - ======== ============================================ - Support :math:`\textrm{support}(f)` - Mean :math:`w\mu` - ======== ============================================ - - Parameters - ---------- - w: array of floats - w >= 0 and w <= 1 - the mixture weights - comp_dists: PyMC distribution (e.g. `pm.Multinomial.dist(...)`) - The `comp_dists` can be scalar or multidimensional distribution. - Assuming its shape to be - (i_0, ..., i_n, mixture_axis, i_n+1, ..., i_N), - the `mixture_axis` is consumed resulting in the shape of mixture as - - (i_0, ..., i_n, i_n+1, ..., i_N). - mixture_axis: int, default = -1 - Axis representing the mixture components to be reduced in the mixture. - - Notes - ----- - The default behaviour resembles Mixture distribution wherein the last axis of component - distribution is reduced. - """ - - def __init__(self, w, comp_dists, mixture_axis=-1, *args, **kwargs): - self.w = at.as_tensor_variable(w) - if not isinstance(comp_dists, Distribution): - raise TypeError( - "The MixtureSameFamily distribution only accepts Distribution " - f"instances as its components. 
Got {type(comp_dists)} instead." - ) - self.comp_dists = comp_dists - if mixture_axis < 0: - mixture_axis = len(comp_dists.shape) + mixture_axis - if mixture_axis < 0: - raise ValueError( - "`mixture_axis` is supposed to be in shape of components' distribution. " - f"Got {mixture_axis + len(comp_dists.shape)} axis instead out of the bounds." - ) - comp_shape = to_tuple(comp_dists.shape) - self.shape = comp_shape[:mixture_axis] + comp_shape[mixture_axis + 1 :] - self.mixture_axis = mixture_axis - kwargs.setdefault("dtype", self.comp_dists.dtype) - - # Compute the mode so we don't always have to pass a initval - defaults = kwargs.pop("defaults", []) - event_shape = self.comp_dists.shape[mixture_axis + 1 :] - _w = at.shape_padleft( - at.shape_padright(w, len(event_shape)), - len(self.comp_dists.shape) - w.ndim - len(event_shape), - ) - mode = take_along_axis( - self.comp_dists.mode, - at.argmax(_w, keepdims=True), - axis=mixture_axis, - ) - self.mode = mode[(..., 0) + (slice(None),) * len(event_shape)] - - if not all_discrete(comp_dists): - mean = at.as_tensor_variable(self.comp_dists.mean) - self.mean = (_w * mean).sum(axis=mixture_axis) - if "mean" not in defaults: - defaults.append("mean") - defaults.append("mode") - - super().__init__(defaults=defaults, *args, **kwargs) - - def logp(self, value): - """ - Calculate log-probability of defined ``MixtureSameFamily`` distribution at specified value. - - Parameters - ---------- - value : numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or Aesara tensor - - Returns - ------- - TensorVariable - """ - - comp_dists = self.comp_dists - w = self.w - mixture_axis = self.mixture_axis - - event_shape = comp_dists.shape[mixture_axis + 1 :] - - # To be able to broadcast the comp_dists.logp with w and value - # We first have to pad the shape of w to the right with ones - # so that it can broadcast with the event_shape. - - w = at.shape_padright(w, len(event_shape)) - - # Second, we have to add the mixture_axis to the value tensor - # To insert the mixture axis at the correct location, we use the - # negative number index. This way, we can also handle situations - # in which, value is an observed value with more batch dimensions - # than the ones present in the comp_dists. - comp_dists_ndim = len(comp_dists.shape) - - value = at.shape_padaxis(value, axis=mixture_axis - comp_dists_ndim) - - comp_logp = comp_dists.logp(value) - return check_parameters( - logsumexp(at.log(w) + comp_logp, axis=mixture_axis, keepdims=False), - w >= 0, - w <= 1, - at.allclose(w.sum(axis=mixture_axis - comp_dists_ndim), 1), - broadcast_conditions=False, - ) - - def random(self, point=None, size=None): - """ - Draw random values from defined ``MixtureSameFamily`` distribution. - - Parameters - ---------- - point : dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size : int, optional - Desired size of random sample (returns one sample if not - specified). - - Returns - ------- - array - """ - # sample_shape = to_tuple(size) - # mixture_axis = self.mixture_axis - # - # # First we draw values for the mixture component weights - # (w,) = draw_values([self.w], point=point, size=size) - # - # # We now draw random choices from those weights. - # # However, we have to ensure that the number of choices has the - # # sample_shape present. 
- # w_shape = w.shape - # batch_shape = self.comp_dists.shape[: mixture_axis + 1] - # param_shape = np.broadcast(np.empty(w_shape), np.empty(batch_shape)).shape - # event_shape = self.comp_dists.shape[mixture_axis + 1 :] - # - # if np.asarray(self.shape).size != 0: - # comp_dists_ndim = len(self.comp_dists.shape) - # - # # If event_shape of both comp_dists and supplied shape matches, - # # broadcast only batch_shape - # # else broadcast the entire given shape with batch_shape. - # if list(self.shape[mixture_axis - comp_dists_ndim + 1 :]) == list(event_shape): - # dist_shape = np.broadcast( - # np.empty(self.shape[:mixture_axis]), np.empty(param_shape[:mixture_axis]) - # ).shape - # else: - # dist_shape = np.broadcast( - # np.empty(self.shape), np.empty(param_shape[:mixture_axis]) - # ).shape - # else: - # dist_shape = param_shape[:mixture_axis] - # - # # Try to determine the size that must be used to get the mixture - # # components (i.e. get random choices using w). - # # 1. There must be size independent choices based on w. - # # 2. There must also be independent draws for each non singleton axis - # # of w. - # # 3. There must also be independent draws for each dimension added by - # # self.shape with respect to the w.ndim. These usually correspond to - # # observed variables with batch shapes - # wsh = (1,) * (len(dist_shape) - len(w_shape) + 1) + w_shape[:mixture_axis] - # psh = (1,) * (len(dist_shape) - len(param_shape) + 1) + param_shape[:mixture_axis] - # w_sample_size = [] - # # Loop through the dist_shape to get the conditions 2 and 3 first - # for i in range(len(dist_shape)): - # if dist_shape[i] != psh[i] and wsh[i] == 1: - # # self.shape[i] is a non singleton dimension (usually caused by - # # observed data) - # sh = dist_shape[i] - # else: - # sh = wsh[i] - # w_sample_size.append(sh) - # - # if sample_shape is not None and w_sample_size[: len(sample_shape)] != sample_shape: - # w_sample_size = sample_shape + tuple(w_sample_size) - # - # choices = random_choice(p=w, size=w_sample_size) - # - # # We now draw samples from the mixture components random method - # comp_samples = self.comp_dists.random(point=point, size=size) - # if comp_samples.shape[: len(sample_shape)] != sample_shape: - # comp_samples = np.broadcast_to( - # comp_samples, - # shape=sample_shape + comp_samples.shape, - # ) - # - # # At this point the shapes of the arrays involved are: - # # comp_samples.shape = (sample_shape, batch_shape, mixture_axis, event_shape) - # # choices.shape = (sample_shape, batch_shape) - # # - # # To be able to take the choices along the mixture_axis of the - # # comp_samples, we have to add in dimensions to the right of the - # # choices array. - # # We also need to make sure that the batch_shapes of both the comp_samples - # # and choices broadcast with each other. 
- # - # choices = np.reshape(choices, choices.shape + (1,) * (1 + len(event_shape))) - # - # choices, comp_samples = get_broadcastable_dist_samples([choices, comp_samples], size=size) - # - # # We now take the choices of the mixture components along the mixture_axis - # # but we use the negative index representation to be able to handle the - # # sample_shape - # samples = np.take_along_axis( - # comp_samples, choices, axis=mixture_axis - len(self.comp_dists.shape) - # ) - # - # # The `samples` array still has the `mixture_axis`, so we must remove it: - # output = samples[(..., 0) + (slice(None),) * len(event_shape)] - # return output - - def _distr_parameters_for_repr(self): - return [] diff --git a/pymc/tests/test_distributions_moments.py b/pymc/tests/test_distributions_moments.py index 5018ab4ce1..6b3aa60fe4 100644 --- a/pymc/tests/test_distributions_moments.py +++ b/pymc/tests/test_distributions_moments.py @@ -100,7 +100,6 @@ def test_all_distributions_have_moments(): # Distributions that have not been refactored for V4 yet not_implemented = { - dist_module.mixture.MixtureSameFamily, dist_module.timeseries.AR, dist_module.timeseries.AR1, dist_module.timeseries.GARCH11, diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index 2fb3dc1f84..8be809d9a8 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -34,7 +34,6 @@ LKJCholeskyCov, LogNormal, Mixture, - MixtureSameFamily, Multinomial, MvNormal, Normal, @@ -886,8 +885,12 @@ def loose_logp(model, vars): assert_allclose(mix_logp, latent_mix_logp, rtol=rtol) -@pytest.mark.xfail(reason="MixtureSameFamily not refactored yet") class TestMixtureSameFamily(SeededTest): + """Tests that used to belong to deprecated `TestMixtureSameFamily`. + + The functionality is now expected to be provided by `Mixture` + """ + @classmethod def setup_class(cls): super().setup_class() @@ -903,17 +906,16 @@ def test_with_multinomial(self, batch_shape): mixture_axis = len(batch_shape) with Model() as model: comp_dists = Multinomial.dist(p=p, n=n, shape=(*batch_shape, self.mixture_comps, 3)) - mixture = MixtureSameFamily( + mixture = Mixture( "mixture", w=w, comp_dists=comp_dists, - mixture_axis=mixture_axis, shape=(*batch_shape, 3), ) - prior = sample_prior_predictive(samples=self.n_samples) + prior = sample_prior_predictive(samples=self.n_samples, return_inferencedata=False) assert prior["mixture"].shape == (self.n_samples, *batch_shape, 3) - assert mixture.random(size=self.size).shape == (self.size, *batch_shape, 3) + assert draw(mixture, draws=self.size).shape == (self.size, *batch_shape, 3) if aesara.config.floatX == "float32": rtol = 1e-4 @@ -921,18 +923,16 @@ def test_with_multinomial(self, batch_shape): rtol = 1e-7 initial_point = model.compute_initial_point() - comp_logp = comp_dists.logp(initial_point["mixture"].reshape(*batch_shape, 1, 3)) + comp_logp = logp(comp_dists, initial_point["mixture"].reshape(*batch_shape, 1, 3)) log_sum_exp = logsumexp( - comp_logp.eval() + np.log(w)[..., None], axis=mixture_axis, keepdims=True + comp_logp.eval() + np.log(w), axis=mixture_axis, keepdims=True ).sum() assert_allclose( - model.logp(initial_point), + model.compile_logp()(initial_point), log_sum_exp, rtol, ) - # TODO: Handle case when `batch_shape` == `sample_shape`. - # See https://github.com/pymc-devs/pymc/issues/4185 for details. 
def test_with_mvnormal(self): # 10 batch, 3-variate Gaussian mu = np.random.randn(self.mixture_comps, 3) @@ -943,13 +943,11 @@ def test_with_mvnormal(self): with Model() as model: comp_dists = MvNormal.dist(mu=mu, chol=chol, shape=(self.mixture_comps, 3)) - mixture = MixtureSameFamily( - "mixture", w=w, comp_dists=comp_dists, mixture_axis=0, shape=(3,) - ) - prior = sample_prior_predictive(samples=self.n_samples) + mixture = Mixture("mixture", w=w, comp_dists=comp_dists, shape=(3,)) + prior = sample_prior_predictive(samples=self.n_samples, return_inferencedata=False) assert prior["mixture"].shape == (self.n_samples, 3) - assert mixture.random(size=self.size).shape == (self.size, 3) + assert draw(mixture, draws=self.size).shape == (self.size, 3) if aesara.config.floatX == "float32": rtol = 1e-4 @@ -957,12 +955,10 @@ def test_with_mvnormal(self): rtol = 1e-7 initial_point = model.compute_initial_point() - comp_logp = comp_dists.logp(initial_point["mixture"].reshape(1, 3)) - log_sum_exp = logsumexp( - comp_logp.eval() + np.log(w)[..., None], axis=0, keepdims=True - ).sum() + comp_logp = logp(comp_dists, initial_point["mixture"].reshape(1, 3)) + log_sum_exp = logsumexp(comp_logp.eval() + np.log(w), axis=0, keepdims=True).sum() assert_allclose( - model.logp(initial_point), + model.compile_logp()(initial_point), log_sum_exp, rtol, ) @@ -971,7 +967,7 @@ def test_broadcasting_in_shape(self): with Model() as model: mu = Gamma("mu", 1.0, 1.0, shape=2) comp_dists = Poisson.dist(mu, shape=2) - mix = MixtureSameFamily("mix", w=np.ones(2) / 2, comp_dists=comp_dists, shape=(1000,)) - prior = sample_prior_predictive(samples=self.n_samples) + mix = Mixture("mix", w=np.ones(2) / 2, comp_dists=comp_dists, shape=(1000,)) + prior = sample_prior_predictive(samples=self.n_samples, return_inferencedata=False) assert prior["mix"].shape == (self.n_samples, 1000) From 04baabd5e9cba89e72b341f36bb0b6540e500839 Mon Sep 17 00:00:00 2001 From: Larry Dong Date: Tue, 1 Feb 2022 15:54:25 +0100 Subject: [PATCH 12/13] Add Mixture moments --- pymc/distributions/mixture.py | 26 ++++- pymc/tests/test_mixture.py | 191 ++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 1 deletion(-) diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py index 83bcfc9512..0e06fef5f8 100644 --- a/pymc/distributions/mixture.py +++ b/pymc/distributions/mixture.py @@ -26,7 +26,13 @@ from pymc.aesaraf import change_rv_size from pymc.distributions.continuous import Normal, get_tau_sigma from pymc.distributions.dist_math import check_parameters -from pymc.distributions.distribution import Discrete, Distribution, SymbolicDistribution +from pymc.distributions.distribution import ( + Discrete, + Distribution, + SymbolicDistribution, + _get_moment, + get_moment, +) from pymc.distributions.logprob import logp from pymc.distributions.shape_utils import to_tuple from pymc.util import check_dist_not_registered @@ -398,6 +404,24 @@ def marginal_mixture_logprob(op, values, rng, weights, *components, **kwargs): return mix_logp +@_get_moment.register(MarginalMixtureRV) +def get_moment_marginal_mixture(op, rv, rng, weights, *components): + ndim_supp = components[0].owner.op.ndim_supp + weights = at.shape_padright(weights, ndim_supp) + mix_axis = -ndim_supp - 1 + + if len(components) == 1: + moment_components = get_moment(components[0]) + + else: + moment_components = at.stack( + [get_moment(component) for component in components], + axis=mix_axis, + ) + + return at.sum(weights * moment_components, axis=mix_axis) + + class 
NormalMixture: R""" Normal mixture log-likelihood diff --git a/pymc/tests/test_mixture.py b/pymc/tests/test_mixture.py index 8be809d9a8..51754aa788 100644 --- a/pymc/tests/test_mixture.py +++ b/pymc/tests/test_mixture.py @@ -53,6 +53,7 @@ from pymc.step_methods import Metropolis from pymc.tests.helpers import SeededTest from pymc.tests.test_distributions import Domain, Simplex +from pymc.tests.test_distributions_moments import assert_moment_is_expected from pymc.tests.test_distributions_random import pymc_random @@ -971,3 +972,193 @@ def test_broadcasting_in_shape(self): prior = sample_prior_predictive(samples=self.n_samples, return_inferencedata=False) assert prior["mix"].shape == (self.n_samples, 1000) + + +class TestMixtureMoments: + @pytest.mark.parametrize( + "weights, comp_dists, size, expected", + [ + ( + np.array([0.4, 0.6]), + Normal.dist(mu=np.array([-2, 6]), sigma=np.array([5, 3])), + None, + 2.8, + ), + ( + np.tile(1 / 13, 13), + Normal.dist(-2, 1, size=(13,)), + (3,), + np.full((3,), -2), + ), + ( + np.array([0.4, 0.6]), + Normal.dist([-2, 6], 3), + (5, 3), + np.full((5, 3), 2.8), + ), + ( + np.broadcast_to(np.array([0.4, 0.6]), (5, 3, 2)), + Normal.dist(np.array([-2, 6]), np.array([5, 3])), + None, + np.full(shape=(5, 3), fill_value=2.8), + ), + ( + np.array([0.4, 0.6]), + Normal.dist(np.array([-2, 6]), np.array([5, 3]), size=(5, 3, 2)), + None, + np.full(shape=(5, 3), fill_value=2.8), + ), + ( + np.array([[0.8, 0.2], [0.2, 0.8]]), + Normal.dist(np.array([-2, 6])), + None, + np.array([-0.4, 4.4]), + ), + # implied size = (11, 7) will be overwritten by (5, 3) + ( + np.array([0.4, 0.6]), + Normal.dist(np.array([-2, 6]), np.array([5, 3]), size=(11, 7, 2)), + (5, 3), + np.full(shape=(5, 3), fill_value=2.8), + ), + ], + ) + def test_single_univariate_component(self, weights, comp_dists, size, expected): + with Model() as model: + Mixture("x", weights, comp_dists, size=size) + assert_moment_is_expected(model, expected, check_finite_logp=False) + + @pytest.mark.parametrize( + "weights, comp_dists, size, expected", + [ + ( + np.array([1, 0]), + [Normal.dist(-2, 5), Normal.dist(6, 3)], + None, + -2, + ), + ( + np.array([0.4, 0.6]), + [Normal.dist(-2, 5, size=(2,)), Normal.dist(6, 3, size=(2,))], + None, + np.full((2,), 2.8), + ), + ( + np.array([0.5, 0.5]), + [Normal.dist(-2, 5), Exponential.dist(lam=1 / 3)], + (3, 5), + np.full((3, 5), 0.5), + ), + ( + np.broadcast_to(np.array([0.4, 0.6]), (5, 3, 2)), + [Normal.dist(-2, 5), Normal.dist(6, 3)], + None, + np.full(shape=(5, 3), fill_value=2.8), + ), + ( + np.array([[0.8, 0.2], [0.2, 0.8]]), + [Normal.dist(-2, 5), Normal.dist(6, 3)], + None, + np.array([-0.4, 4.4]), + ), + ( + np.array([[0.8, 0.2], [0.2, 0.8]]), + [Normal.dist(-2, 5), Normal.dist(6, 3)], + (3, 2), + np.full((3, 2), np.array([-0.4, 4.4])), + ), + ( + # implied size = (11, 7) will be overwritten by (5, 3) + np.array([0.4, 0.6]), + [Normal.dist(-2, 5, size=(11, 7)), Normal.dist(6, 3, size=(11, 7))], + (5, 3), + np.full(shape=(5, 3), fill_value=2.8), + ), + ], + ) + def test_list_univariate_components(self, weights, comp_dists, size, expected): + with Model() as model: + Mixture("x", weights, comp_dists, size=size) + assert_moment_is_expected(model, expected, check_finite_logp=False) + + @pytest.mark.parametrize( + "weights, comp_dists, size, expected", + [ + ( + np.array([0.4, 0.6]), + MvNormal.dist(mu=np.array([[-1, -2], [3, 5]]), cov=np.eye(2) * 0.3), + None, + np.array([1.4, 2.2]), + ), + ( + np.array([0.5, 0.5]), + Dirichlet.dist(a=np.array([[0.0001, 0.0001, 1000], [2, 
4, 6]])), + (4,), + np.array(np.full((4, 3), [1 / 12, 1 / 6, 3 / 4])), + ), + ( + np.array([0.4, 0.6]), + MvNormal.dist(mu=np.array([-10, 0, 10]), cov=np.eye(3) * 3, size=(4, 2)), + None, + np.full((4, 3), [-10, 0, 10]), + ), + ( + np.array([[1.0, 0], [0.0, 1.0]]), + MvNormal.dist( + mu=np.array([[-5, -10, -15], [5, 10, 15]]), cov=np.eye(3) * 3, size=(2,) + ), + (3, 2), + np.full((3, 2, 3), [[-5, -10, -15], [5, 10, 15]]), + ), + ], + ) + def test_single_multivariate_component(self, weights, comp_dists, size, expected): + with Model() as model: + Mixture("x", weights, comp_dists, size=size) + assert_moment_is_expected(model, expected, check_finite_logp=False) + + @pytest.mark.parametrize( + "weights, comp_dists, size, expected", + [ + ( + np.array([0.4, 0.6]), + [ + MvNormal.dist(mu=np.array([-1, -2]), cov=np.eye(2) * 0.3), + MvNormal.dist(mu=np.array([3, 5]), cov=np.eye(2) * 0.8), + ], + None, + np.array([1.4, 2.2]), + ), + ( + np.array([0.4, 0.6]), + [ + Dirichlet.dist(a=np.array([2, 3, 5])), + MvNormal.dist(mu=np.array([-10, 0, 10]), cov=np.eye(3) * 3), + ], + (4,), + np.array(np.full((4, 3), [-5.92, 0.12, 6.2])), + ), + ( + np.array([0.4, 0.6]), + [ + Dirichlet.dist(a=np.array([2, 3, 5]), size=(2,)), + MvNormal.dist(mu=np.array([-10, 0, 10]), cov=np.eye(3) * 3, size=(2,)), + ], + None, + np.full((2, 3), [-5.92, 0.12, 6.2]), + ), + ( + np.array([[1.0, 0], [0.0, 1.0]]), + [ + MvNormal.dist(mu=np.array([-5, -10, -15]), cov=np.eye(3) * 3, size=(2,)), + MvNormal.dist(mu=np.array([5, 10, 15]), cov=np.eye(3) * 3, size=(2,)), + ], + (3, 2), + np.full((3, 2, 3), [[-5, -10, -15], [5, 10, 15]]), + ), + ], + ) + def test_list_multivariate_components(self, weights, comp_dists, size, expected): + with Model() as model: + Mixture("x", weights, comp_dists, size=size) + assert_moment_is_expected(model, expected, check_finite_logp=False) From 0e442d1bc6c45bb5f741427825c0d451492839fe Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 1 Mar 2022 14:23:58 +0100 Subject: [PATCH 13/13] Update release notes --- RELEASE-NOTES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 0094cc29b7..ce265b5d55 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -13,7 +13,7 @@ Instead update the vNext section until 4.0.0 is out. ### Not-yet working features We plan to get these working again, but at this point their inner workings have not been refactored. - Timeseries distributions (see [#4642](https://github.com/pymc-devs/pymc/issues/4642)) -- Mixture distributions (see [#4781](https://github.com/pymc-devs/pymc/issues/4781)) +- Nested Mixture distributions (see [#5533](https://github.com/pymc-devs/pymc/issues/5533)) - Elliptical slice sampling (see [#5137](https://github.com/pymc-devs/pymc/issues/5137)) - `BaseStochasticGradient` (see [#5138](https://github.com/pymc-devs/pymc/issues/5138)) - `pm.sample_posterior_predictive_w` (see [#4807](https://github.com/pymc-devs/pymc/issues/4807)) @@ -72,6 +72,7 @@ All of the above apply to: - In the gp.utils file, the `kmeans_inducing_points` function now passes through `kmeans_kwargs` to scipy's k-means function. - The function `replace_with_values` function has been added to `gp.utils`. - `MarginalSparse` has been renamed `MarginalApprox`. + - Removed `MixtureSameFamily`. `Mixture` is now capable of handling batched multivariate components (see [#5438](https://github.com/pymc-devs/pymc/pull/5438)). - ... ### Expected breaks
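The moment rule that `get_moment_marginal_mixture` implements — pad `weights` on the right so they broadcast over the event dimensions, then take the weighted sum of component moments along the mixture axis — can be verified by hand against the parametrizations above. A NumPy sketch of that arithmetic (not the Aesara code path itself):

```python
import numpy as np

# Univariate case from the first `test_single_univariate_component`
# parametrization: the moments of Normal(-2, 5) and Normal(6, 3) are
# their means, so the mixture moment is a plain weighted average.
w = np.array([0.4, 0.6])
component_moments = np.array([-2.0, 6.0])
print(np.sum(w * component_moments, axis=-1))  # 2.8, the `expected` value

# Multivariate case: pad the weights on the right (as
# `at.shape_padright(weights, ndim_supp)` does) so they broadcast over
# the event dimension before summing over the mixture axis.
w_padded = w[:, None]                              # shape (2, 1)
mv_moments = np.array([[-1.0, -2.0], [3.0, 5.0]])  # the two MvNormal means
print(np.sum(w_padded * mv_moments, axis=0))       # [1.4, 2.2], matching the test
```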