From b2836499d37afb091a2e85bf166031947e09a808 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sun, 24 Jan 2021 23:09:29 -0600 Subject: [PATCH 01/44] Initial refactoring for RandomVariable use in Model, step methods, and basic dists These changes can be summarized as follows: - `Model` objects now track fully functional Theano graphs that represent all relationships between random and "deterministic" variables. These graphs are called these "sample-space" graphs. `Model.unobserved_RVs`, `Model.basic_RVs`, `Model.free_RVs`, and `Model.observed_RVs` contain these graphs (i.e. `TensorVariable`s), which are generated by `RandomVariable` `Op`s. - For each random variable, there is now a corresponding "measure-space" variable (i.e. a `TensorVariable` that corresponds to said variable in a log-likelihood graph). These variables are available as `rv_var.tag.value_var`, for each random variable `rv_var`, or via `Model.vars`. - Log-likelihood (i.e. measure-space) graphs are now created for individual random variables by way of the generic functions `logpt`, `logcdf`, `logp_nojac`, and `logpt_sum` in `pymc3.distributions`. - Numerous uses of concrete shape information stemming from `Model` objects (e.g. `Model.size`) have been removed/refactored. - Use of `FreeRV`, `ObservedRV`, `MultiObservedRV`, and `TransformedRV` has been deprecated. The information previously stored in these classes is now tracked using `TensorVariable.tag`, and log-likelihoods are generated using the aforementioned `log*` generic functions. --- pymc3/backends/base.py | 13 +- pymc3/distributions/__init__.py | 407 +++++++++++++++++- pymc3/distributions/continuous.py | 300 +++++++------- pymc3/distributions/discrete.py | 182 ++++---- pymc3/distributions/distribution.py | 69 +++- pymc3/distributions/multivariate.py | 111 ++--- pymc3/distributions/transforms.py | 197 +++++---- pymc3/model.py | 617 ++++++++++++++++++++-------- pymc3/sampling.py | 16 +- pymc3/step_methods/gibbs.py | 2 +- pymc3/step_methods/hmc/base_hmc.py | 6 +- pymc3/tests/sampler_fixtures.py | 2 +- pymc3/tests/test_model.py | 50 ++- pymc3/tests/test_model_helpers.py | 157 +++++++ pymc3/tuning/starting.py | 7 +- pymc3/util.py | 58 ++- 16 files changed, 1536 insertions(+), 658 deletions(-) create mode 100644 pymc3/tests/test_model_helpers.py diff --git a/pymc3/backends/base.py b/pymc3/backends/base.py index e9227cfd95..3a5f37f3fa 100644 --- a/pymc3/backends/base.py +++ b/pymc3/backends/base.py @@ -61,18 +61,7 @@ def __init__(self, name, model=None, vars=None, test_point=None): model = modelcontext(model) self.model = model if vars is None: - vars = [] - for v in model.unobserved_RVs: - var = getattr(v.tag, "value_var", v) - transform = getattr(var.tag, "transform", None) - if transform: - # We need to create and add an un-transformed version of - # each transformed variable - untrans_var = transform.backward(v, var) - untrans_var.name = v.name - vars.append(untrans_var) - vars.append(var) - + vars = [v.tag.value_var for v in model.unobserved_RVs] self.vars = vars self.varnames = [var.name for var in vars] self.fn = model.fastfn(vars) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index d92dad0cfe..648f35d392 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -11,16 +11,377 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
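A minimal sketch of the workflow the commit message describes: sample-space `RandomVariable` graphs paired with measure-space value variables, and log-likelihood graphs built through the generic `logpt` function. The distribution name and arguments below are illustrative assumptions based on this diff, not verified usage.

    import pymc3 as pm
    from pymc3.distributions import logpt

    with pm.Model() as model:
        # Sample-space graph: a TensorVariable produced by a RandomVariable Op,
        # tracked in model.free_RVs.
        mu = pm.Normal("mu", 0.0, 1.0)

    # Measure-space counterpart created by Model.register_rv.
    mu_value = mu.tag.value_var

    # Log-likelihood graph for `mu`, evaluated at the value variable.
    mu_logp = logpt(mu, mu_value)
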
+from functools import singledispatch +from typing import Generator, List, Optional, Tuple, Union -from pymc3.distributions.logp import ( # isort:skip - _logcdf, - _logp, - logcdf, - logp_transform, - logpt, - logpt_sum, -) +import aesara.tensor as aet +import numpy as np + +from aesara import config +from aesara.graph.basic import Variable, ancestors, clone_replace +from aesara.graph.op import compute_test_value +from aesara.tensor.random.op import Observed, RandomVariable +from aesara.tensor.subtensor import AdvancedSubtensor, AdvancedSubtensor1, Subtensor +from aesara.tensor.var import TensorVariable + +from pymc3.aesaraf import floatX + +PotentialShapeType = Union[ + int, np.ndarray, Tuple[Union[int, Variable], ...], List[Union[int, Variable]], Variable +] + + +def _get_scaling(total_size, shape, ndim): + """ + Gets scaling constant for logp + + Parameters + ---------- + total_size: int or list[int] + shape: shape + shape to scale + ndim: int + ndim hint + + Returns + ------- + scalar + """ + if total_size is None: + coef = floatX(1) + elif isinstance(total_size, int): + if ndim >= 1: + denom = shape[0] + else: + denom = 1 + coef = floatX(total_size) / floatX(denom) + elif isinstance(total_size, (list, tuple)): + if not all(isinstance(i, int) for i in total_size if (i is not Ellipsis and i is not None)): + raise TypeError( + "Unrecognized `total_size` type, expected " + "int or list of ints, got %r" % total_size + ) + if Ellipsis in total_size: + sep = total_size.index(Ellipsis) + begin = total_size[:sep] + end = total_size[sep + 1 :] + if Ellipsis in end: + raise ValueError( + "Double Ellipsis in `total_size` is restricted, got %r" % total_size + ) + else: + begin = total_size + end = [] + if (len(begin) + len(end)) > ndim: + raise ValueError( + "Length of `total_size` is too big, " + "number of scalings is bigger that ndim, got %r" % total_size + ) + elif (len(begin) + len(end)) == 0: + return floatX(1) + if len(end) > 0: + shp_end = shape[-len(end) :] + else: + shp_end = np.asarray([]) + shp_begin = shape[: len(begin)] + begin_coef = [floatX(t) / shp_begin[i] for i, t in enumerate(begin) if t is not None] + end_coef = [floatX(t) / shp_end[i] for i, t in enumerate(end) if t is not None] + coefs = begin_coef + end_coef + coef = aet.prod(coefs) + else: + raise TypeError( + "Unrecognized `total_size` type, expected int or list of ints, got %r" % total_size + ) + return aet.as_tensor(floatX(coef)) + + +def change_rv_size( + rv_var: TensorVariable, + new_size: PotentialShapeType, + expand: Optional[bool] = False, +) -> TensorVariable: + """Change or expand the size of a `RandomVariable`. + + Parameters + ========== + rv_var + The `RandomVariable` output. + new_size + The new size. + expand: + Whether or not to completely replace the `size` parameter in `rv_var` + with `new_size` or simply prepend it to the existing `size`. 
+ + """ + rv_node = rv_var.owner + rng, size, dtype, *dist_params = rv_node.inputs + name = rv_var.name + tag = rv_var.tag + + if expand: + new_size = tuple(np.atleast_1d(new_size)) + tuple(size) + + new_rv_node = rv_node.op.make_node(rng, new_size, dtype, *dist_params) + rv_var = new_rv_node.outputs[-1] + rv_var.name = name + for k, v in tag.__dict__.items(): + rv_var.tag.__dict__.setdefault(k, v) + + if config.compute_test_value != "off": + compute_test_value(new_rv_node) + + return rv_var + + +def rv_log_likelihood_args( + rv_var: TensorVariable, + rv_value: Optional[TensorVariable] = None, + transformed: Optional[bool] = True, +) -> Tuple[TensorVariable, TensorVariable]: + """Get a `RandomVariable` and its corresponding log-likelihood `TensorVariable` value. + + Parameters + ========== + rv_var + A variable corresponding to a `RandomVariable`, whether directly or + indirectly (e.g. an observed variable that's the output of an + `Observed` `Op`). + rv_value + The measure-space input `TensorVariable` (i.e. "input" to a + log-likelihood). + transformed + When ``True``, return the transformed value var. + + Returns + ======= + The first value in the tuple is the `RandomVariable`, and the second is the + measure-space variable that corresponds with the latter. The first is used + to determine the log likelihood graph and the second is the "input" + parameter to that graph. In the case of an observed `RandomVariable`, the + "input" is actual data; in all other cases, it's just another + `TensorVariable`. + + """ + + if rv_value is None: + if rv_var.owner and isinstance(rv_var.owner.op, Observed): + rv_var, rv_value = rv_var.owner.inputs + elif hasattr(rv_var.tag, "value_var"): + rv_value = rv_var.tag.value_var + else: + return rv_var, None + + rv_value = aet.as_tensor_variable(rv_value) + + transform = getattr(rv_value.tag, "transform", None) + if transformed and transform: + rv_value = transform.forward(rv_value) + + return rv_var, rv_value + + +def rv_ancestors(graphs: List[TensorVariable]) -> Generator[TensorVariable, None, None]: + """Yield the ancestors that are `RandomVariable` outputs for the given `graphs`.""" + for anc in ancestors(graphs): + if anc in graphs: + continue + if anc.owner and isinstance(anc.owner.op, RandomVariable): + yield anc + + +def strip_observed(x: TensorVariable) -> TensorVariable: + """Return the `RandomVariable` term for an `Observed` node input; otherwise, return the input.""" + if x.owner and isinstance(x.owner.op, Observed): + return x.owner.inputs[0] + else: + return x + + +def sample_to_measure_vars(graphs: List[TensorVariable]) -> List[TensorVariable]: + """Replace `RandomVariable` terms in graphs with their measure-space counterparts.""" + replace = {} + for anc in ancestors(graphs): + if anc.owner and isinstance(anc.owner.op, RandomVariable): + measure_var = getattr(anc.tag, "value_var", None) + if measure_var is not None: + replace[anc] = measure_var + + dist_params = clone_replace(graphs, replace=replace) + return dist_params + + +def logpt( + rv_var: TensorVariable, + rv_value: Optional[TensorVariable] = None, + jacobian: bool = True, + scaling: Optional[bool] = True, + **kwargs, +) -> TensorVariable: + """Create a measure-space (i.e. log-likelihood) graph for a random variable at a given point. + + The input `rv_var` determines which log-likelihood graph is used and + `rv_value` is that graph's input parameter. For example, if `rv_var` is + the output of a `NormalRV` `Op`, then the output is + ``normal_log_pdf(rv_value)``. 
+ + Parameters + ========== + rv_var + The `RandomVariable` output that determines the log-likelihood graph. + rv_value + The input variable for the log-likelihood graph. + jacobian + Whether or not to include the Jacobian term. + scaling + A scaling term to apply to the generated log-likelihood graph. + + """ + + rv_var, rv_value = rv_log_likelihood_args(rv_var, rv_value) + rv_node = rv_var.owner + + if not rv_node: + raise TypeError("rv_var must be the output of a RandomVariable Op") + + if not isinstance(rv_node.op, RandomVariable): + + if isinstance(rv_node.op, (Subtensor, AdvancedSubtensor, AdvancedSubtensor1)): + + raise NotImplementedError("Missing value support is incomplete") + # "Flatten" and sum an array of indexed RVs' log-likelihoods + rv_var, missing_values = rv_node.inputs + rv_value = rv_var.tag.value_var + + missing_values = missing_values.data + logp_var = aet.sum( + [ + logpt( + rv_var, + ) + for idx, missing in zip( + np.ndindex(missing_values.shape), missing_values.flatten() + ) + if missing + ] + ) + return logp_var + + return aet.zeros_like(rv_var) + + rng, size, dtype, *dist_params = rv_node.inputs + + dist_params = sample_to_measure_vars(dist_params) + + if jacobian: + logp_var = _logp(rv_node.op, rv_value, *dist_params, **kwargs) + else: + logp_var = _logp_nojac(rv_node.op, rv_value, *dist_params, **kwargs) + + # Replace `RandomVariable` ancestors with their corresponding + # log-likelihood input variables + lik_replacements = [ + (v, v.tag.value_var) + for v in ancestors([logp_var]) + if v.owner and isinstance(v.owner.op, RandomVariable) and getattr(v.tag, "value_var", None) + ] + + (logp_var,) = clone_replace([logp_var], replace=lik_replacements) + + if scaling: + logp_var *= _get_scaling( + getattr(rv_var.tag, "total_size", None), rv_value.shape, rv_value.ndim + ) + + if rv_var.name is not None: + logp_var.name = "__logp_%s" % rv_var.name + + return logp_var + + +@singledispatch +def _logp(op, value, *dist_params, **kwargs): + """Create a log-likelihood graph. + + This function dispatches on the type of `op`, which should be a subclass + of `RandomVariable`. If you want to implement new log-likelihood graphs + for a `RandomVariable`, register a new function on this dispatcher. + + """ + return aet.zeros_like(value) + + +def logcdf(rv_var, rv_value, **kwargs): + """Create a log-CDF graph.""" + + rv_var, rv_value = rv_log_likelihood_args(rv_var, rv_value) + rv_node = rv_var.owner + + if not rv_node: + raise TypeError() + + rng, size, dtype, *dist_params = rv_node.inputs + + dist_params = sample_to_measure_vars(dist_params) + + return _logcdf(rv_node.op, rv_value, *dist_params, **kwargs) + + +@singledispatch +def _logcdf(op, value, *args, **kwargs): + """Create a log-CDF graph. + + This function dispatches on the type of `op`, which should be a subclass + of `RandomVariable`. If you want to implement new log-CDF graphs + for a `RandomVariable`, register a new function on this dispatcher. 
+ + """ + raise NotImplementedError() + + +def logp_nojac(rv_var, rv_value=None, **kwargs): + """Create a graph of the log-likelihood that doesn't include the Jacobian.""" + + rv_var, rv_value = rv_log_likelihood_args(rv_var, rv_value) + rv_node = rv_var.owner + + if not rv_node: + raise TypeError() + + rng, size, dtype, *dist_params = rv_node.inputs + + dist_params = sample_to_measure_vars(dist_params) + + return _logp_nojac(rv_node.op, rv_value, **kwargs) + + +@singledispatch +def _logp_nojac(op, value, *args, **kwargs): + """Return the logp, but do not include a jacobian term for transforms. + + If we use different parametrizations for the same distribution, we + need to add the determinant of the jacobian of the transformation + to make sure the densities still describe the same distribution. + However, MAP estimates are not invariant with respect to the + parameterization, we need to exclude the jacobian terms in this case. + + This function should be overwritten in base classes for transformed + distributions. + """ + return logpt(op, value, *args, **kwargs) + + +def logpt_sum(rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, **kwargs): + """Return the sum of the logp values for the given observations. + + Subclasses can use this to improve the speed of logp evaluations + if only the sum of the logp values is needed. + """ + return aet.sum(logpt(rv_var, rv_value, **kwargs)) + + +# from pymc3.distributions import timeseries +from pymc3.distributions import shape_utils, transforms from pymc3.distributions.bart import BART from pymc3.distributions.bound import Bound from pymc3.distributions.continuous import ( @@ -82,6 +443,8 @@ Discrete, Distribution, NoDistribution, + draw_values, + generate_samples, ) from pymc3.distributions.mixture import Mixture, MixtureSameFamily, NormalMixture from pymc3.distributions.multivariate import ( @@ -99,15 +462,15 @@ WishartBartlett, ) from pymc3.distributions.simulator import Simulator -from pymc3.distributions.timeseries import ( - AR, - AR1, - GARCH11, - GaussianRandomWalk, - MvGaussianRandomWalk, - MvStudentTRandomWalk, -) +# from pymc3.distributions.timeseries import ( +# AR, +# AR1, +# GARCH11, +# GaussianRandomWalk, +# MvGaussianRandomWalk, +# MvStudentTRandomWalk, +# ) __all__ = [ "Uniform", "Flat", @@ -165,13 +528,13 @@ "WishartBartlett", "LKJCholeskyCov", "LKJCorr", - "AR1", - "AR", + # "AR1", + # "AR", "AsymmetricLaplace", - "GaussianRandomWalk", - "MvGaussianRandomWalk", - "MvStudentTRandomWalk", - "GARCH11", + # "GaussianRandomWalk", + # "MvGaussianRandomWalk", + # "MvStudentTRandomWalk", + # "GARCH11", "SkewNormal", "Mixture", "NormalMixture", diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index f4efa97a07..23012dc9a1 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -17,19 +17,17 @@ A collection of common probability distributions for stochastic nodes in PyMC. 
""" +from copy import copy import aesara.tensor as at import numpy as np from aesara.assert_op import Assert from aesara.tensor.random.basic import ( - BetaRV, - cauchy, - exponential, + GammaRV, + NormalRV, + UniformRV, gamma, - halfcauchy, - halfnormal, - invgamma, normal, uniform, ) @@ -37,7 +35,7 @@ from scipy.interpolate import InterpolatedUnivariateSpline from pymc3.aesaraf import floatX -from pymc3.distributions import logp_transform, transforms +from pymc3.distributions import _logcdf, _logp, transforms from pymc3.distributions.dist_math import ( SplineWrapper, betaln, @@ -91,44 +89,46 @@ "AsymmetricLaplace", ] +# FIXME: These are temporary hacks +normal = copy(normal) +normal.inplace = True +uniform = copy(uniform) +uniform.inplace = True +gamma = copy(gamma) +gamma.inplace = True + class PositiveContinuous(Continuous): """Base class for positive continuous distributions""" + default_transform = transforms.log + class UnitContinuous(Continuous): """Base class for continuous distributions on [0,1]""" + default_transform = transforms.logodds + class BoundedContinuous(Continuous): """Base class for bounded continuous distributions""" + default_transform = "auto" + + def create_transform(transform="auto", lower=None, upper=None): @logp_transform.register(PositiveContinuous) def pos_cont_transform(op): return transforms.log -@logp_transform.register(UnitContinuous) -def unit_cont_transform(op): - return transforms.logodds - - -@logp_transform.register(BoundedContinuous) -def bounded_cont_transform(op): - def transform_params(rv_var): - _, _, _, lower, upper = rv_var.owner.inputs - lower = at.as_tensor_variable(lower) if lower is not None else None - upper = at.as_tensor_variable(upper) if upper is not None else None - return lower, upper - - return transforms.interval(transform_params) + return transform def assert_negative_support(var, label, distname, value=-1e-6): msg = f"The variable specified for {label} has negative support for {distname}, " msg += "likely making it unsuitable for this parameter." - return Assert(msg)(var, at.all(at.ge(var, 0.0))) + return Assert(msg)(var, aet.all(aet.ge(var, 0.0))) def get_tau_sigma(tau=None, sigma=None): @@ -219,51 +219,59 @@ class Uniform(BoundedContinuous): @classmethod def dist(cls, lower=0, upper=1, **kwargs): - lower = at.as_tensor_variable(floatX(lower)) - upper = at.as_tensor_variable(floatX(upper)) + lower = aet.as_tensor_variable(floatX(lower)) + upper = aet.as_tensor_variable(floatX(upper)) # mean = (upper + lower) / 2.0 # median = self.mean - return super().dist([lower, upper], **kwargs) - def logp(value, lower, upper): - """ - Calculate log-probability of Uniform distribution at specified value. + transform = kwargs.pop("transform", cls.default_transform) + transform = cls.create_transform(transform, lower, upper) - Parameters - ---------- - value: numeric - Value for which log-probability is calculated. + return super().dist([lower, upper], transform=transform, **kwargs) - Returns - ------- - TensorVariable - """ - return bound(-at.log(upper - lower), value >= lower, value <= upper) - def logcdf(value, lower, upper): - """ - Compute the log of the cumulative distribution function for Uniform distribution - at the specified value. +@_logp.register(UniformRV) +def uniform_logp(op, value, lower, upper): + """ + Calculate log-probability of Uniform distribution at specified value. - Parameters - ---------- - value: numeric or np.ndarray or `TensorVariable` - Value(s) for which log CDF is calculated. 
If the log CDF for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. + Parameters + ---------- + value: numeric + Value for which log-probability is calculated. - Returns - ------- - TensorVariable - """ - return at.switch( - at.lt(value, lower) | at.lt(upper, lower), - -np.inf, - at.switch( - at.lt(value, upper), - at.log(value - lower) - at.log(upper - lower), - 0, - ), - ) + Returns + ------- + TensorVariable + """ + return bound(-aet.log(upper - lower), value >= lower, value <= upper) + + +@_logcdf.register(UniformRV) +def uniform_logcdf(op, value, lower, upper): + """ + Compute the log of the cumulative distribution function for Uniform distribution + at the specified value. + + Parameters + ---------- + value: numeric or np.ndarray or `TensorVariable` + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. + + Returns + ------- + TensorVariable + """ + return aet.switch( + aet.lt(value, lower) | aet.lt(upper, lower), + -np.inf, + aet.switch( + aet.lt(value, upper), + aet.log(value - lower) - aet.log(upper - lower), + 0, + ), + ) class Flat(Continuous): @@ -447,59 +455,61 @@ class Normal(Continuous): rv_op = normal @classmethod - def dist(cls, mu=0, sigma=None, tau=None, sd=None, no_assert=False, **kwargs): + def dist(cls, mu=0, sigma=None, tau=None, sd=None, **kwargs): if sd is not None: sigma = sd tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - sigma = at.as_tensor_variable(sigma) + sigma = aet.as_tensor_variable(sigma) # sd = sigma - # tau = at.as_tensor_variable(tau) - # mean = median = mode = mu = at.as_tensor_variable(floatX(mu)) + # tau = aet.as_tensor_variable(tau) + # mean = median = mode = mu = aet.as_tensor_variable(floatX(mu)) # variance = 1.0 / self.tau - if not no_assert: - assert_negative_support(sigma, "sigma", "Normal") - + assert_negative_support(sigma, "sigma", "Normal") return super().dist([mu, sigma], **kwargs) - def logp(value, mu, sigma): - """ - Calculate log-probability of Normal distribution at specified value. - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. +@_logp.register(NormalRV) +def normal_logp(op, value, mu, sigma): + """ + Calculate log-probability of Normal distribution at specified value. - Returns - ------- - TensorVariable - """ - tau, sigma = get_tau_sigma(tau=None, sigma=sigma) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. - return bound((-tau * (value - mu) ** 2 + at.log(tau / np.pi / 2.0)) / 2.0, sigma > 0) + Returns + ------- + TensorVariable + """ + tau, sigma = get_tau_sigma(tau=None, sigma=sigma) - def logcdf(value, mu, sigma): - """ - Compute the log of the cumulative distribution function for Normal distribution - at the specified value. + return bound((-tau * (value - mu) ** 2 + aet.log(tau / np.pi / 2.0)) / 2.0, sigma > 0) - Parameters - ---------- - value: numeric or np.ndarray or `TensorVariable` - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. 
- Returns - ------- - TensorVariable - """ - return bound( - normal_lcdf(mu, sigma, value), - 0 < sigma, - ) +@_logcdf.register(NormalRV) +def normal_logcdf(op, value, mu, sigma): + """ + Compute the log of the cumulative distribution function for Normal distribution + at the specified value. + + Parameters + ---------- + value: numeric or np.ndarray or `TensorVariable` + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. + + Returns + ------- + TensorVariable + """ + return bound( + normal_lcdf(mu, sigma, value), + 0 < sigma, + ) class TruncatedNormal(BoundedContinuous): @@ -2364,22 +2374,22 @@ class Gamma(PositiveContinuous): rv_op = gamma @classmethod - def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, no_assert=False, **kwargs): + def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwargs): if sd is not None: sigma = sd alpha, beta = cls.get_alpha_beta(alpha, beta, mu, sigma) - alpha = at.as_tensor_variable(floatX(alpha)) - beta = at.as_tensor_variable(floatX(beta)) + alpha = aet.as_tensor_variable(floatX(alpha)) + beta = aet.as_tensor_variable(floatX(beta)) # mean = alpha / beta - # mode = at.maximum((alpha - 1) / beta, 0) + # mode = aet.maximum((alpha - 1) / beta, 0) # variance = alpha / beta ** 2 if not no_assert: assert_negative_support(alpha, "alpha", "Gamma") assert_negative_support(beta, "beta", "Gamma") - return super().dist([alpha, at.inv(beta)], **kwargs) + return super().dist([alpha, beta], **kwargs) @classmethod def get_alpha_beta(cls, alpha=None, beta=None, mu=None, sigma=None): @@ -2400,53 +2410,57 @@ def get_alpha_beta(cls, alpha=None, beta=None, mu=None, sigma=None): def _distr_parameters_for_repr(self): return ["alpha", "beta"] - def logp(value, alpha, beta): - """ - Calculate log-probability of Gamma distribution at specified value. - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. +@_logp.register(GammaRV) +def gamma_logp(op, value, alpha, beta): + """ + Calculate log-probability of Gamma distribution at specified value. - Returns - ------- - TensorVariable - """ - return bound( - -gammaln(alpha) + logpow(beta, alpha) - beta * value + logpow(value, alpha - 1), - value >= 0, - alpha > 0, - beta > 0, - ) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. - def logcdf(value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Gamma distribution - at the specified value. + Returns + ------- + TensorVariable + """ + return bound( + -gammaln(alpha) + logpow(beta, alpha) - beta * value + logpow(value, alpha - 1), + value >= 0, + alpha > 0, + beta > 0, + ) - Parameters - ---------- - value: numeric or np.ndarray or `TensorVariable` - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. 
- Returns - ------- - TensorVariable - """ - # Avoid C-assertion when the gammainc function is called with invalid values (#4340) - safe_alpha = at.switch(at.lt(alpha, 0), 0, alpha) - safe_beta = at.switch(at.lt(beta, 0), 0, beta) - safe_value = at.switch(at.lt(value, 0), 0, value) +@_logcdf.register(GammaRV) +def gamma_logcdf(op, value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Gamma distribution + at the specified value. - return bound( - at.log(at.gammainc(safe_alpha, safe_beta * safe_value)), - 0 <= value, - 0 < alpha, - 0 < beta, - ) + Parameters + ---------- + value: numeric or np.ndarray or `TensorVariable` + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. + + Returns + ------- + TensorVariable + """ + # Avoid C-assertion when the gammainc function is called with invalid values (#4340) + safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) + safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) + safe_value = aet.switch(aet.lt(value, 0), 0, value) + + return bound( + aet.log(aet.gammainc(safe_alpha, safe_beta * safe_value)), + 0 <= value, + 0 < alpha, + 0 < beta, + ) class InverseGamma(PositiveContinuous): diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 31cf813a5d..0fa23ae6ce 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -13,13 +13,16 @@ # limitations under the License. import warnings -import aesara.tensor as at +from copy import copy + +import aesara.tensor as aet import numpy as np -from aesara.tensor.random.basic import bernoulli, binomial, categorical, nbinom, poisson +from aesara.tensor.random.basic import BinomialRV, CategoricalRV, binomial, categorical from scipy import stats from pymc3.aesaraf import floatX, intX, take_along_axis +from pymc3.distributions import _logcdf, _logp from pymc3.distributions.dist_math import ( betaln, binomln, @@ -53,6 +56,12 @@ "OrderedLogistic", ] +# FIXME: These are temporary hacks +categorical = copy(categorical) +categorical.inplace = True +binomial = copy(binomial) +binomial.inplace = True + class Binomial(Discrete): R""" @@ -100,67 +109,71 @@ class Binomial(Discrete): @classmethod def dist(cls, n, p, *args, **kwargs): - n = at.as_tensor_variable(intX(n)) - p = at.as_tensor_variable(floatX(p)) - # mode = at.cast(tround(n * p), self.dtype) + n = aet.as_tensor_variable(intX(n)) + p = aet.as_tensor_variable(floatX(p)) + # mode = aet.cast(tround(n * p), self.dtype) return super().dist([n, p], **kwargs) - def logp(value, n, p): - r""" - Calculate log-probability of Binomial distribution at specified value. - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor +@_logp.register(BinomialRV) +def binomial_logp(op, value, n, p): + r""" + Calculate log-probability of Binomial distribution at specified value. - Returns - ------- - TensorVariable - """ - return bound( - binomln(n, value) + logpow(p, value) + logpow(1 - p, n - value), - 0 <= value, - value <= n, - 0 <= p, - p <= 1, - ) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. 
If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - def logcdf(value, n, p): - """ - Compute the log of the cumulative distribution function for Binomial distribution - at the specified value. + Returns + ------- + TensorVariable + """ + return bound( + binomln(n, value) + logpow(p, value) + logpow(1 - p, n - value), + 0 <= value, + value <= n, + 0 <= p, + p <= 1, + ) - Parameters - ---------- - value: numeric - Value for which log CDF is calculated. - Returns - ------- - TensorVariable - """ - # incomplete_beta function can only handle scalar values (see #4342) - if np.ndim(value): - raise TypeError( - f"Binomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." - ) +@_logcdf.register(BinomialRV) +def binomial_logcdf(op, value, n, p): + """ + Compute the log of the cumulative distribution function for Binomial distribution + at the specified value. - value = at.floor(value) + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. - return bound( - at.switch( - at.lt(value, n), - at.log(incomplete_beta(n - value, value + 1, 1 - p)), - 0, - ), - 0 <= value, - 0 < n, - 0 <= p, - p <= 1, + Returns + ------- + TensorVariable + """ + # incomplete_beta function can only handle scalar values (see #4342) + if np.ndim(value): + raise TypeError( + f"Binomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." ) + value = aet.floor(value) + + return bound( + aet.switch( + aet.lt(value, n), + aet.log(incomplete_beta(n - value, value + 1, 1 - p)), + 0, + ), + 0 <= value, + 0 < n, + 0 <= p, + p <= 1, + ) + class BetaBinomial(Discrete): R""" @@ -1238,48 +1251,49 @@ class Categorical(Discrete): @classmethod def dist(cls, p, **kwargs): - p = at.as_tensor_variable(floatX(p)) + p = aet.as_tensor_variable(floatX(p)) - # mode = at.argmax(p, axis=-1) + # mode = aet.argmax(p, axis=-1) # if mode.ndim == 1: - # mode = at.squeeze(mode) + # mode = aet.squeeze(mode) return super().dist([p], **kwargs) - def logp(value, p): - r""" - Calculate log-probability of Categorical distribution at specified value. - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or `TensorVariable` +@_logp.register(CategoricalRV) +def categorical_logp(op, value, p_, upper): + r""" + Calculate log-probability of Categorical distribution at specified value. - """ - k = at.shape(p)[-1] - p_ = p - p = p_ / at.sum(p_, axis=-1, keepdims=True) - value_clip = at.clip(value, 0, k - 1) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. 
If the log probabilities for multiple + values are desired the values must be provided in a numpy array or `TensorVariable` - if p.ndim > 1: - if p.ndim > value_clip.ndim: - value_clip = at.shape_padleft(value_clip, p_.ndim - value_clip.ndim) - elif p.ndim < value_clip.ndim: - p = at.shape_padleft(p, value_clip.ndim - p_.ndim) - pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1)) - a = at.log( - take_along_axis( - p.dimshuffle(pattern), - value_clip, - ) + """ + p = p_ / aet.sum(p_, axis=-1, keepdims=True) + k = aet.shape(p_)[-1] + value_clip = aet.clip(value, 0, k - 1) + + if p.ndim > 1: + if p.ndim > value_clip.ndim: + value_clip = aet.shape_padleft(value_clip, p_.ndim - value_clip.ndim) + elif p.ndim < value_clip.ndim: + p = aet.shape_padleft(p, value_clip.ndim - p_.ndim) + pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1)) + a = aet.log( + take_along_axis( + p.dimshuffle(pattern), + value_clip, ) - else: - a = at.log(p[value_clip]) - - return bound( - a, value >= 0, value <= (k - 1), at.all(p_ >= 0, axis=-1), at.all(p <= 1, axis=-1) ) + else: + a = aet.log(p[value_clip]) + + return bound( + a, value >= 0, value <= (k - 1), aet.all(p_ >= 0, axis=-1), aet.all(p <= 1, axis=-1) + ) class Constant(Discrete): diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index e3f5893718..dee5ceb129 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -33,9 +33,29 @@ import aesara import aesara.graph.basic -import aesara.tensor as at - -from pymc3.util import UNSET, get_repr_for_variable +import aesara.tensor as aet +import numpy as np + +from aesara import function +from aesara.compile.sharedvalue import SharedVariable +from aesara.graph.basic import Constant +from aesara.tensor.var import TensorVariable +from cachetools import LRUCache, cached + +from pymc3.distributions.shape_utils import ( + broadcast_dist_samples_shape, + get_broadcastable_dist_samples, + to_tuple, +) +from pymc3.model import ( + ContextMeta, + FreeRV, + Model, + MultiObservedRV, + ObservedRV, + build_named_node_tree, +) +from pymc3.util import get_repr_for_variable, get_var_name, hash_key from pymc3.vartypes import string_types __all__ = [ @@ -44,6 +64,8 @@ "Continuous", "Discrete", "NoDistribution", + "draw_values", + "generate_samples", ] vectorized_ppc = contextvars.ContextVar( @@ -125,8 +147,8 @@ def logcdf(op, var, rvs_to_values, *dist_params, **kwargs): class Distribution(metaclass=DistributionMeta): """Statistical distribution""" - rv_class = None rv_op = None + default_transform = None def __new__(cls, name, *args, **kwargs): try: @@ -151,6 +173,9 @@ def __new__(cls, name, *args, **kwargs): data = kwargs.pop("observed", None) + if isinstance(data, ObservedRV) or isinstance(data, FreeRV): + raise TypeError("observed needs to be data but got: {}".format(type(data))) + total_size = kwargs.pop("total_size", None) dims = kwargs.pop("dims", None) @@ -158,11 +183,41 @@ def __new__(cls, name, *args, **kwargs): if "shape" in kwargs: raise DeprecationWarning("The `shape` keyword is deprecated; use `size`.") - transform = kwargs.pop("transform", UNSET) - rv_out = cls.dist(*args, rng=rng, **kwargs) - return model.register_rv(rv_out, name, data, total_size, dims=dims, transform=transform) + return model.register_rv(rv_out, name, data, total_size, dims=dims) + + @classmethod + def dist(cls, dist_params, **kwargs): + transform = kwargs.pop("transform", cls.default_transform) + testval = kwargs.pop("testval", None) + + rv_var = cls.rv_op(*dist_params, **kwargs) + 
+ rv_var.tag.transform = transform + + if testval is not None: + rv_var.tag.test_value = testval + + return rv_var + + def default(self): + return np.asarray(self.get_test_val(self.testval, self.defaults), self.dtype) + + def get_test_val(self, val, defaults): + if val is None: + for v in defaults: + if hasattr(self, v): + attr_val = self.getattr_value(v) + if np.all(np.isfinite(attr_val)): + return attr_val + raise AttributeError( + "%s has no finite default value to use, " + "checked: %s. Pass testval argument or " + "adjust so value is finite." % (self, str(defaults)) + ) + else: + return self.getattr_value(val) @classmethod def dist(cls, dist_params, **kwargs): diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 4eb6b01817..17bc671a29 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -17,6 +17,8 @@ import warnings +from copy import copy + import aesara import aesara.tensor as at import numpy as np @@ -25,8 +27,7 @@ from aesara.graph.basic import Apply from aesara.graph.op import Op from aesara.tensor.nlinalg import det, eigh, matrix_inverse, trace -from aesara.tensor.random.basic import MultinomialRV, dirichlet, multivariate_normal -from aesara.tensor.random.utils import broadcast_params +from aesara.tensor.random.basic import DirichletRV, dirichlet from aesara.tensor.slinalg import ( Cholesky, Solve, @@ -39,7 +40,7 @@ import pymc3 as pm from pymc3.aesaraf import floatX, intX -from pymc3.distributions import transforms +from pymc3.distributions import _logp, transforms from pymc3.distributions.continuous import ChiSquared, Normal from pymc3.distributions.dist_math import bound, factln, logpow from pymc3.distributions.distribution import Continuous, Discrete @@ -61,51 +62,10 @@ "CAR", ] -solve_lower = Solve(A_structure="lower_triangular") -# Step methods and advi do not catch LinAlgErrors at the -# moment. We work around that by using a cholesky op -# that returns a nan as first entry instead of raising -# an error. -cholesky = Cholesky(lower=True, on_error="nan") - - -def quaddist_matrix(cov=None, chol=None, tau=None, lower=True, *args, **kwargs): - if chol is not None and not lower: - chol = chol.T +# FIXME: These are temporary hacks +dirichlet = copy(dirichlet) +dirichlet.inplace = True - if len([i for i in [tau, cov, chol] if i is not None]) != 1: - raise ValueError("Incompatible parameterization. Specify exactly one of tau, cov, or chol.") - - if cov is not None: - cov = at.as_tensor_variable(cov) - if cov.ndim != 2: - raise ValueError("cov must be two dimensional.") - elif tau is not None: - tau = at.as_tensor_variable(tau) - if tau.ndim != 2: - raise ValueError("tau must be two dimensional.") - # TODO: What's the correct order/approach (in the non-square case)? - # `aesara.tensor.nlinalg.tensorinv`? - cov = matrix_inverse(tau) - else: - # TODO: What's the correct order/approach (in the non-square case)? 
- chol = at.as_tensor_variable(chol) - if chol.ndim != 2: - raise ValueError("chol must be two dimensional.") - cov = chol.dot(chol.T) - - return cov - - -def quaddist_parse(value, mu, cov, mat_type="cov"): - """Compute (x - mu).T @ Sigma^-1 @ (x - mu) and the logdet of Sigma.""" - if value.ndim > 2 or value.ndim == 0: - raise ValueError("Invalid dimension for value: %s" % value.ndim) - if value.ndim == 1: - onedim = True - value = value[None, :] - else: - onedim = False delta = value - mu @@ -386,45 +346,44 @@ class Dirichlet(Continuous): """ rv_op = dirichlet - - def __new__(cls, name, *args, **kwargs): - kwargs.setdefault("transform", transforms.stick_breaking) - return super().__new__(cls, name, *args, **kwargs) + default_transform = transforms.stick_breaking @classmethod def dist(cls, a, **kwargs): - a = at.as_tensor_variable(a) - # mean = a / at.sum(a) - # mode = at.switch(at.all(a > 1), (a - 1) / at.sum(a - 1), np.nan) + a = aet.as_tensor_variable(a) + # mean = a / aet.sum(a) + # mode = aet.switch(aet.all(a > 1), (a - 1) / aet.sum(a - 1), np.nan) return super().dist([a], **kwargs) - def logp(value, a): - """ - Calculate log-probability of Dirichlet distribution - at specified value. + def _distr_parameters_for_repr(self): + return ["a"] - Parameters - ---------- - value: numeric - Value for which log-probability is calculated. - Returns - ------- - TensorVariable - """ - # only defined for sum(value) == 1 - return bound( - at.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(at.sum(a, axis=-1)), - at.all(value >= 0), - at.all(value <= 1), - at.all(a > 0), - broadcast_conditions=False, - ) +@_logp.register(DirichletRV) +def dirichlet_logp(op, value, a): + """ + Calculate log-probability of Dirichlet distribution + at specified value. - def _distr_parameters_for_repr(self): - return ["a"] + Parameters + ---------- + value: numeric + Value for which log-probability is calculated. + + Returns + ------- + TensorVariable + """ + # only defined for sum(value) == 1 + return bound( + aet.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(aet.sum(a, axis=-1)), + aet.all(value >= 0), + aet.all(value <= 1), + aet.all(a > 0), + broadcast_conditions=False, + ) class MultinomialRV(MultinomialRV): diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index 86dfec050e..34a845665e 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -15,9 +15,10 @@ import aesara.tensor as at from aesara.tensor.subtensor import advanced_set_subtensor1 -from aesara.tensor.var import TensorVariable +from aesara.tensor.type import TensorType from pymc3.aesaraf import floatX, gradient +from pymc3.distributions import distribution from pymc3.math import invlogit, logit, logsumexp __all__ = [ @@ -62,28 +63,10 @@ def forward(self, rv_var: TensorVariable, rv_value: TensorVariable) -> TensorVar `rv_var`, it will transform the random variable `rv_value` after sampling from `rv_var`. - **Do not apply transforms to `rv_var`.** `rv_var` is only provided - as a means of describing the random variable associated with `rv_value`. - `rv_value` is the variable that should be transformed, and the transform - can use information from `rv_var`--within `param_extract_fn`--to do - that (e.g. the random variable's parameters via `rv_var.owner.inputs`). - - Parameters - ---------- - rv_var - The random variable. - rv_value - The variable representing a value of `rv_var`. - - Returns - -------- - tensor - Transformed tensor. 
- """ - raise NotImplementedError - - def backward(self, rv_var: TensorVariable, rv_value: TensorVariable) -> TensorVariable: - """Applies inverse of transformation. + def backward(self, z): + """Applies inverse of transformation to input variable `z`. + When transform is used on some distribution `p`, which has observed values `z`, it is used to + transform the values of `z` correctly to the support of `p`. Parameters ---------- @@ -138,8 +121,8 @@ def backward(self, rv_var, rv_value): def forward(self, rv_var, rv_value): return at.log(rv_value) - def jacobian_det(self, rv_var, rv_value): - return rv_value + def jacobian_det(self, x): + return x log = Log() @@ -157,10 +140,10 @@ def forward(self, rv_var, rv_value): y = Log(Exp(x) - 1) = Log(1 - Exp(-x)) + x """ - return at.log(1.0 - at.exp(-rv_value)) + rv_value + return aet.log(1.0 - aet.exp(-x)) + x - def jacobian_det(self, rv_var, rv_value): - return -at.nnet.softplus(-rv_value) + def jacobian_det(self, x): + return -aet.nnet.softplus(-x) log_exp_m1 = LogExpM1() @@ -172,9 +155,6 @@ class LogOdds(ElemwiseTransform): def backward(self, rv_var, rv_value): return invlogit(rv_value, 0.0) - def forward(self, rv_var, rv_value): - return logit(rv_value) - logodds = LogOdds() @@ -184,46 +164,84 @@ class Interval(ElemwiseTransform): name = "interval" - def __init__(self, param_extract_fn): - self.param_extract_fn = param_extract_fn - - def backward(self, rv_var, rv_value): - a, b = self.param_extract_fn(rv_var) - - if a is not None and b is not None: - sigmoid_x = at.nnet.sigmoid(rv_value) - return sigmoid_x * b + (1 - sigmoid_x) * a - elif a is not None: - return at.exp(rv_value) + a - elif b is not None: - return b - at.exp(rv_value) - else: - return rv_value + def __init__(self, a, b): + self.a = aet.as_tensor_variable(a) + self.b = aet.as_tensor_variable(b) - def forward(self, rv_var, rv_value): - a, b = self.param_extract_fn(rv_var) - if a is not None and b is not None: - return at.log(rv_value - a) - at.log(b - rv_value) - elif a is not None: - return at.log(rv_value - a) - elif b is not None: - return at.log(b - rv_value) - else: - return rv_value + def backward(self, x): + a, b = self.a, self.b + sigmoid_x = aet.nnet.sigmoid(x) + r = sigmoid_x * b + (1 - sigmoid_x) * a + return r - def jacobian_det(self, rv_var, rv_value): - a, b = self.param_extract_fn(rv_var) + def forward(self, x): + a, b = self.a, self.b + return aet.log(x - a) - aet.log(b - x) - if a is not None and b is not None: - s = at.nnet.softplus(-rv_value) - return at.log(b - a) - 2 * s - rv_value - else: - return rv_value + def jacobian_det(self, x): + s = aet.nnet.softplus(-x) + return aet.log(self.b - self.a) - 2 * s - x interval = Interval +class LowerBound(ElemwiseTransform): + """Transform from real line interval [a,inf] to whole real line.""" + + name = "lowerbound" + + def __init__(self, a): + self.a = aet.as_tensor_variable(a) + + def backward(self, x): + a = self.a + r = aet.exp(x) + a + return r + + def forward(self, x): + a = self.a + return aet.log(x - a) + + def jacobian_det(self, x): + return x + + +lowerbound = LowerBound +""" +Alias for ``LowerBound`` (:class: LowerBound) Transform (:class: Transform) class +for use in the ``transform`` argument of a random variable. 
+""" + + +class UpperBound(ElemwiseTransform): + """Transform from real line interval [-inf,b] to whole real line.""" + + name = "upperbound" + + def __init__(self, b): + self.b = aet.as_tensor_variable(b) + + def backward(self, x): + b = self.b + r = b - aet.exp(x) + return r + + def forward(self, x): + b = self.b + return aet.log(b - x) + + def jacobian_det(self, x): + return x + + +upperbound = UpperBound +""" +Alias for ``UpperBound`` (:class: UpperBound) Transform (:class: Transform) class +for use in the ``transform`` argument of a random variable. +""" + + class Ordered(Transform): name = "ordered" @@ -233,14 +251,8 @@ def backward(self, rv_var, rv_value): x = at.inc_subtensor(x[..., 1:], at.exp(rv_value[..., 1:])) return at.cumsum(x, axis=-1) - def forward(self, rv_var, rv_value): - y = at.zeros(rv_value.shape) - y = at.inc_subtensor(y[..., 0], rv_value[..., 0]) - y = at.inc_subtensor(y[..., 1:], at.log(rv_value[..., 1:] - rv_value[..., :-1])) - return y - - def jacobian_det(self, rv_var, rv_value): - return at.sum(rv_value[..., 1:], axis=-1) + def jacobian_det(self, y): + return aet.sum(y[..., 1:], axis=-1) ordered = Ordered() @@ -265,9 +277,9 @@ def backward(self, rv_var, rv_value): def forward(self, rv_var, rv_value): return rv_value[..., :-1] - def jacobian_det(self, rv_var, rv_value): - y = at.zeros(rv_value.shape) - return at.sum(y, axis=-1) + def jacobian_det(self, x): + y = aet.zeros(x.shape) + return aet.sum(y, axis=-1) sum_to_1 = SumTo1() @@ -298,14 +310,9 @@ def forward(self, rv_var, rv_value): y = lx[:-1] - shift return floatX(y.T) - def backward(self, rv_var, rv_value): - if rv_var.broadcastable[-1]: - # If this variable is just a bunch of scalars/degenerate - # Dirichlets, we can't transform it - return rv_value - - y = rv_value.T - y = at.concatenate([y, -at.sum(y, 0, keepdims=True)]) + def backward(self, y_): + y = y_.T + y = aet.concatenate([y, -aet.sum(y, 0, keepdims=True)]) # "softmax" with vector support and no deprication warning: e_y = at.exp(y - at.max(y, 0, keepdims=True)) x = e_y / at.sum(e_y, 0, keepdims=True) @@ -337,11 +344,8 @@ class Circular(ElemwiseTransform): def backward(self, rv_var, rv_value): return at.arctan2(at.sin(rv_value), at.cos(rv_value)) - def forward(self, rv_var, rv_value): - return at.as_tensor_variable(rv_value) - - def jacobian_det(self, rv_var, rv_value): - return at.zeros(rv_value.shape) + def jacobian_det(self, x): + return aet.zeros(x.shape) circular = Circular() @@ -357,13 +361,8 @@ def backward(self, rv_var, rv_value): diag_idxs = self.param_extract_fn(rv_var) return advanced_set_subtensor1(rv_value, at.exp(rv_value[diag_idxs]), diag_idxs) - def forward(self, rv_var, rv_value): - diag_idxs = self.param_extract_fn(rv_var) - return advanced_set_subtensor1(rv_value, at.log(rv_value[diag_idxs]), diag_idxs) - - def jacobian_det(self, rv_var, rv_value): - diag_idxs = self.param_extract_fn(rv_var) - return at.sum(rv_value[diag_idxs]) + def jacobian_det(self, y): + return aet.sum(y[self.diag_idxs]) class Chain(Transform): @@ -374,14 +373,14 @@ def __init__(self, transform_list): self.transform_list = transform_list self.name = "+".join([transf.name for transf in self.transform_list]) - def forward(self, rv_var, rv_value): - y = rv_value + def forward(self, x): + y = x for transf in self.transform_list: - y = transf.forward(rv_var, y) + y = transf.forward(y) return y - def backward(self, rv_var, rv_value): - x = rv_value + def backward(self, y): + x = y for transf in reversed(self.transform_list): x = transf.backward(rv_var, x) return x 
diff --git a/pymc3/model.py b/pymc3/model.py index 9cccde8b30..8d771afc68 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -29,22 +29,19 @@ from aesara.compile.sharedvalue import SharedVariable from aesara.gradient import grad -from aesara.graph.basic import Constant, Variable, graph_inputs +from aesara.graph.basic import Apply, Variable +from aesara.tensor.random.op import Observed, observed +from aesara.tensor.type import TensorType from aesara.tensor.var import TensorVariable from pandas import Series -from pymc3.aesaraf import ( - change_rv_size, - gradient, - hessian, - inputvars, - pandas_to_array, - rvs_to_value_vars, -) -from pymc3.blocking import DictToArrayBijection, RaveledVars +import pymc3 as pm + +from pymc3.aesaraf import generator, gradient, hessian, inputvars +from pymc3.blocking import ArrayOrdering, DictToArrayBijection from pymc3.data import GenTensorVariable, Minibatch -from pymc3.distributions import logp_transform, logpt, logpt_sum -from pymc3.exceptions import ImputationWarning, SamplingError +from pymc3.distributions import _get_scaling, change_rv_size, logpt, logpt_sum +from pymc3.exceptions import ImputationWarning from pymc3.math import flatten_list from pymc3.util import UNSET, WithMemoization, get_var_name, treedict, treelist from pymc3.vartypes import continuous_types, discrete_types, typefilter @@ -380,10 +377,10 @@ def __init__( compute_grads=True, **kwargs, ): - if extra_vars_and_values is None: - extra_vars_and_values = {} + if extra_vars is None: + extra_vars = [] - names = [arg.name for arg in grad_vars + list(extra_vars_and_values.keys())] + names = [arg.name for arg in grad_vars + extra_vars] if any(name is None for name in names): raise ValueError("Arguments must be named.") if len(set(names)) != len(names): @@ -424,8 +421,14 @@ def __init__( givens = [] self._extra_vars_shared = {} - for var, value in extra_vars_and_values.items(): - shared = aesara.shared(value, var.name + "_shared__") + for var in extra_vars: + shared = aesara.shared(var.tag.test_value, var.name + "_shared__") + # test TensorType compatibility + if hasattr(var.tag.test_value, "shape"): + testtype = TensorType(var.dtype, [s == 1 for s in var.tag.test_value.shape]) + + if testtype != shared.type: + shared.type = testtype self._extra_vars_shared[var.name] = shared givens.append((var, shared)) @@ -597,7 +600,10 @@ def __new__(cls, *args, **kwargs): instance._parent = kwargs.get("model") else: instance._parent = cls.get_context(error_if_none=False) - instance._aesara_config = kwargs.get("aesara_config", {}) + aesara_config = kwargs.get("aesara_config", None) + if aesara_config is None or "compute_test_value" not in aesara_config: + aesara_config = {"compute_test_value": "ignore"} + instance._aesara_config = aesara_config return instance def __init__(self, name="", model=None, aesara_config=None, coords=None, check_bounds=True): @@ -649,7 +655,18 @@ def isroot(self): @property def ndim(self): - return sum(var.ndim for var in self.value_vars) + return sum(var.ndim for var in self.free_RVs) + + @property + def logp_array(self): + return self.bijection.mapf(self.fastlogp) + + @property + def dlogp_array(self): + logpt = self.logpt + vars = inputvars(logpt) + dlogp = self.fastfn(gradient(self.logpt, vars)) + return self.bijection.mapf(dlogp) def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): """Compile a aesara function that computes logp and gradient. 
@@ -675,20 +692,13 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): if tempered: with self: - # Convert random variables into their log-likelihood inputs and - # apply their transforms, if any - potentials, _ = rvs_to_value_vars(self.potentials, apply_transforms=True) - - free_RVs_logp = at.sum( + free_RVs_logp = aet.sum( [ - at.sum(logpt(var, getattr(var.tag, "value_var", None))) - for var in self.free_RVs + aet.sum(logpt(var, getattr(var.tag, "value_var", None))) + for var in self.free_RVs + self.potentials ] - + list(potentials) - ) - observed_RVs_logp = at.sum( - [at.sum(logpt(obs, obs.tag.observations)) for obs in self.observed_RVs] ) + observed_RVs_logp = aet.sum([aet.sum(logpt(obs)) for obs in self.observed_RVs]) costs = [free_RVs_logp, observed_RVs_logp] else: @@ -708,15 +718,9 @@ def logpt(self): """Aesara scalar of log-probability of the model""" with self: factors = [logpt_sum(var, getattr(var.tag, "value_var", None)) for var in self.free_RVs] - factors += [logpt_sum(obs, obs.tag.observations) for obs in self.observed_RVs] - - # Convert random variables into their log-likelihood inputs and - # apply their transforms, if any - potentials, _ = rvs_to_value_vars(self.potentials, apply_transforms=True) - - factors += potentials - - logp_var = at.sum([at.sum(factor) for factor in factors]) + factors += [logpt_sum(obs) for obs in self.observed_RVs] + factors += self.potentials + logp_var = aet.sum([aet.sum(factor) for factor in factors]) if self.name: logp_var.name = "__logp_%s" % self.name else: @@ -736,17 +740,9 @@ def logp_nojact(self): logpt_sum(var, getattr(var.tag, "value_var", None), jacobian=False) for var in self.free_RVs ] - factors += [ - logpt_sum(obs, obs.tag.observations, jacobian=False) for obs in self.observed_RVs - ] - - # Convert random variables into their log-likelihood inputs and - # apply their transforms, if any - potentials, _ = rvs_to_value_vars(self.potentials, apply_transforms=True) - factors += potentials - - logp_var = at.sum([at.sum(factor) for factor in factors]) - + factors += [logpt_sum(obs, jacobian=False) for obs in self.observed_RVs] + factors += self.potentials + logp_var = aet.sum([aet.sum(factor) for factor in factors]) if self.name: logp_var.name = "__logp_nojac_%s" % self.name else: @@ -759,30 +755,17 @@ def varlogpt(self): (excluding deterministic).""" with self: factors = [logpt_sum(var, getattr(var.tag, "value_var", None)) for var in self.free_RVs] - return at.sum(factors) + return aet.sum(factors) @property def datalogpt(self): with self: - factors = [logpt(obs, obs.tag.observations) for obs in self.observed_RVs] - - # Convert random variables into their log-likelihood inputs and - # apply their transforms, if any - potentials, _ = rvs_to_value_vars(self.potentials, apply_transforms=True) - - factors += [at.sum(factor) for factor in potentials] - return at.sum(factors) + factors = [logpt(obs) for obs in self.observed_RVs] + factors += [aet.sum(factor) for factor in self.potentials] + return aet.sum(factors) @property def vars(self): - warnings.warn( - "Model.vars has been deprecated. Use Model.value_vars instead.", - DeprecationWarning, - ) - return self.value_vars - - @property - def value_vars(self): """List of unobserved random variables used as inputs to the model's log-likelihood (which excludes deterministics). """ @@ -824,43 +807,8 @@ def independent_vars(self): @property def test_point(self): - warnings.warn( - "`Model.test_point` has been deprecated. 
Use `Model.initial_point` instead.", - DeprecationWarning, - ) - return self.initial_point - - @property - def initial_point(self): - points = [] - for rv_var in self.free_RVs: - value_var = rv_var.tag.value_var - var_value = getattr(value_var.tag, "test_value", None) - - if var_value is None: - - rv_var_value = getattr(rv_var.tag, "test_value", None) - - if rv_var_value is None: - try: - rv_var_value = rv_var.eval() - except Exception: - raise Exception(f"Couldn't generate an initial value for {rv_var}") - - transform = getattr(value_var.tag, "transform", None) - - if transform: - try: - rv_var_value = transform.forward(rv_var, rv_var_value).eval() - except Exception: - raise Exception(f"Couldn't generate an initial value for {rv_var}") - - var_value = rv_var_value - value_var.tag.test_value = var_value - - points.append((value_var, var_value)) - - return Point(points, model=self) + """Test point used to check that the model doesn't generate errors""" + return Point(((var.tag.value_var, var.tag.test_value) for var in self.free_RVs), model=self) @property def disc_vars(self): @@ -902,7 +850,7 @@ def add_coords(self, coords): else: self.coords[name] = coords[name] - def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, transform=UNSET): + def register_rv(self, rv_var, name, data=None, total_size=None, dims=None): """Register an (un)observed random variable with the model. Parameters @@ -926,8 +874,47 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans rv_var.tag.total_size = total_size if data is None: + # Create a `TensorVariable` that will be used as the random + # variable's "value" in log-likelihood graphs. + # + # In general, we'll call this type of variable the "value" variable. + # + # In all other cases, the role of the value variable is taken by + # observed data. That's why value variables are only referenced in + # this branch of the conditional. + value_var = rv_var.clone() + value_var.name = rv_var.name + rv_var.tag.value_var = value_var + self.free_RVs.append(rv_var) - self.create_value_var(rv_var, transform) + + transform = rv_var.tag.transform + value_var.tag.transform = None + + if transform is not None: + self.deterministics.append(rv_var) + + elif isinstance(data, dict): + + # TODO: How exactly does this dictionary map to `rv_var`? 
+ + # obs_rvs = {name: make_obs_var(rv_var, d, name, self) for name, d in data.items()} + # rv_var.tag.data = obs_rvs + # + # missing_values = [ + # datum.missing_values for datum in data.values() if datum.missing_values is not None + # ] + # rv_var.tag.missing_values = missing_values + # + # self.observed_RVs.append(rv_var) + # + # if missing_values: + # self.free_RVs += rv_var.tag.missing_values + # self.missing_values += rv_var.tag.missing_values + # for v in rv_var.tag.missing_values: + # self.named_vars[v.name] = v + + raise NotImplementedError() else: if ( isinstance(data, Variable) @@ -938,51 +925,20 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans data = pandas_to_array(data) - rv_var = make_obs_var(rv_var, data) + rv_var = make_obs_var(rv_var, data, name, self) + rv_var.tag.data = data - self.create_value_var(rv_var, transform) + self.observed_RVs.append(rv_var) - if hasattr(rv_var.tag, "observations"): - self.observed_RVs.append(rv_var) + if rv_var.tag.missing_values: + self.free_RVs.append(rv_var.tag.missing_values) + self.missing_values.append(rv_var.tag.missing_values) + self.named_vars[rv_var.tag.missing_values.name] = rv_var.tag.missing_values self.add_random_variable(rv_var, dims) return rv_var - def create_value_var(self, rv_var: TensorVariable, transform: Any) -> TensorVariable: - """Create a ``TensorVariable`` that will be used as the random - variable's "value" in log-likelihood graphs. - - In general, we'll call this type of variable the "value" variable. - - In all other cases, the role of the value variable is taken by - observed data. That's why value variables are only referenced in - this branch of the conditional. - - """ - value_var = rv_var.type() - - if aesara.config.compute_test_value != "off": - value_var.tag.test_value = rv_var.tag.test_value - - value_var.name = rv_var.name - - rv_var.tag.value_var = value_var - - # Make the value variable a transformed value variable, - # if there's an applicable transform - if transform is UNSET: - transform = logp_transform(rv_var.owner.op) - - if transform is not None: - value_var.tag.transform = transform - value_var.name = f"{value_var.name}_{transform.name}__" - if aesara.config.compute_test_value != "off": - value_var.tag.test_value = transform.forward(rv_var, value_var).tag.test_value - self.named_vars[value_var.name] = value_var - - return value_var - def add_random_variable(self, var, dims=None): """Add a random variable to the named variables of the model.""" if self.named_vars.tree_contains(var.name): @@ -1137,11 +1093,9 @@ def flatten(self, vars=None, order=None, inputvar=None): flat_view """ if vars is None: - vars = self.value_vars - if order is not None: - var_map = {v.name: v for v in vars} - vars = [var_map[n] for n in order] - + vars = self.vars + if order is None: + order = ArrayOrdering(vars) if inputvar is None: inputvar = at.vector("flat_view", dtype=aesara.config.floatX) if aesara.config.compute_test_value != "off": @@ -1264,10 +1218,7 @@ def point_logps(self, point=None, round_vals=2): return Series( { - rv.name: np.round( - self.fn(logpt_sum(rv, getattr(rv.tag, "observations", None)))(point), - round_vals, - ) + rv.name: np.round(self.fn(logpt_sum(rv))(test_point), round_vals) for rv in self.basic_RVs }, name="Log-probability of test_point", @@ -1452,7 +1403,84 @@ def __call__(self, *args, **kwargs): compilef = fastfn -def make_obs_var(rv_var: TensorVariable, data: Union[np.ndarray]) -> TensorVariable: +class FreeRV(Factor, PyMC3Variable): + """Unobserved random 
variable that a model is specified in terms of.""" + + dshape = None # type: Tuple[int, ...] + size = None # type: int + distribution = None # type: Optional[Distribution] + model = None # type: Optional[Model] + + def __init__( + self, + type=None, + owner=None, + index=None, + name=None, + distribution=None, + total_size=None, + model=None, + ): + """ + Parameters + ---------- + type: aesara type (optional) + owner: aesara owner (optional) + name: str + distribution: Distribution + model: Model + total_size: scalar Tensor (optional) + needed for upscaling logp + """ + if type is None: + type = distribution.type + super().__init__(type, owner, index, name) + + if distribution is not None: + self.dshape = tuple(distribution.shape) + self.dsize = int(np.prod(distribution.shape)) + self.distribution = distribution + self.tag.test_value = ( + np.ones(distribution.shape, distribution.dtype) * distribution.default() + ) + self.logp_elemwiset = distribution.logp(self) + # The logp might need scaling in minibatches. + # This is done in `Factor`. + self.logp_sum_unscaledt = distribution.logp_sum(self) + self.logp_nojac_unscaledt = distribution.logp_nojac(self) + self.total_size = total_size + self.model = model + self.scaling = _get_scaling(total_size, self.shape, self.ndim) + + incorporate_methods( + source=distribution, + destination=self, + methods=["random"], + wrapper=InstanceMethod, + ) + + @property + def init_value(self): + """Convenience attribute to return tag.test_value""" + return self.tag.test_value + + rv_var = change_rv_size(rv_var, new_size) + + if aesara.config.compute_test_value != "off": + test_value = getattr(rv_var.tag, "test_value", None) + + if test_value is not None: + # We try to reuse the old test value + rv_var.tag.test_value = np.broadcast_to(test_value, rv_var.tag.test_value.shape) + else: + rv_var.tag.test_value = data + + mask = getattr(data, "mask", None) + if mask is not None: + +def make_obs_var( + rv_var: TensorVariable, data: Union[np.ndarray], name: str, model: Model +) -> TensorVariable: """Create a `TensorVariable` for an observed random variable. Parameters @@ -1461,13 +1489,16 @@ def make_obs_var(rv_var: TensorVariable, data: Union[np.ndarray]) -> TensorVaria The random variable that is observed. data: ndarray The observed data. + name: str + The name of the random variable. + model: Model + The model object. Returns ======= The new observed random variable """ - name = rv_var.name data = pandas_to_array(data).astype(rv_var.dtype) # The shapes of the observed random variable and its data might not @@ -1484,25 +1515,17 @@ def make_obs_var(rv_var: TensorVariable, data: Union[np.ndarray]) -> TensorVaria else: new_size = data.shape + test_value = getattr(rv_var.tag, "test_value", None) + rv_var = change_rv_size(rv_var, new_size) - if aesara.config.compute_test_value != "off": - test_value = getattr(rv_var.tag, "test_value", None) - - if test_value is not None: - # We try to reuse the old test value - rv_var.tag.test_value = np.broadcast_to(test_value, rv_var.tag.test_value.shape) - else: - rv_var.tag.test_value = data + if aesara.config.compute_test_value != "off" and test_value is not None: + # We try to reuse the old test value + rv_var.tag.test_value = np.broadcast_to(test_value, rv_var.tag.test_value.shape) + missing_values = None mask = getattr(data, "mask", None) if mask is not None: - - if mask.all(): - # If there are no observed values, this variable isn't really - # observed. 
- return rv_var - impute_message = ( f"Data in {rv_var} contains missing values and" " will be automatically imputed from the" @@ -1510,20 +1533,178 @@ def make_obs_var(rv_var: TensorVariable, data: Union[np.ndarray]) -> TensorVaria ) warnings.warn(impute_message, ImputationWarning) - comp_data = at.as_tensor_variable(data.compressed()) - data = at.as_tensor_variable(data) - data.tag.mask = mask + missing_values = rv_var[mask] + constant = aet.as_tensor_variable(data.filled()) + data = aet.set_subtensor(constant[mask.nonzero()], missing_values) + + # Now, we need log-likelihood-space terms for these missing values + value_var = rv_var.clone() + value_var.name = f"{rv_var.name}_missing" + rv_var.tag.value_var = value_var - rv_var = at.set_subtensor(rv_var[~mask], comp_data) - rv_var.name = name elif sps.issparse(data): data = sparse.basic.as_sparse(data, name=name) else: - data = at.as_tensor_variable(data, name=name) + data = aet.as_tensor_variable(data, name=name) + + rv_obs = observed(rv_var, data) + rv_obs.tag.missing_values = missing_values + + rv_obs.name = name + + return rv_obs rv_var.tag.observations = data - return rv_var +class ObservedRV(Factor, PyMC3Variable): + """Observed random variable that a model is specified in terms of. + Potentially partially observed. + """ + + def __init__( + self, + type=None, + owner=None, + index=None, + name=None, + data=None, + distribution=None, + total_size=None, + model=None, + ): + """ + Parameters + ---------- + type: aesara type (optional) + owner: aesara owner (optional) + name: str + distribution: Distribution + model: Model + total_size: scalar Tensor (optional) + needed for upscaling logp + """ + + if hasattr(data, "type") and isinstance(data.type, TensorType): + type = data.type + + if type is None: + data = pandas_to_array(data) + if isinstance(data, Variable): + type = data.type + else: + type = TensorType(distribution.dtype, [s == 1 for s in data.shape]) + + self.observations = data + + super().__init__(type, owner, index, name) + + if distribution is not None: + data = aet.as_tensor(data, name, model, distribution) + + self.missing_values = data.missing_values + self.logp_elemwiset = distribution.logp(data) + # The logp might need scaling in minibatches. + # This is done in `Factor`. + self.logp_sum_unscaledt = distribution.logp_sum(data) + self.logp_nojac_unscaledt = distribution.logp_nojac(data) + self.total_size = total_size + self.model = model + self.distribution = distribution + + # make this RV a view on the combined missing/nonmissing array + Apply(aesara.compile.view_op, inputs=[data], outputs=[self]) + self.tag.test_value = aesara.compile.view_op(data).tag.test_value.astype(self.dtype) + self.scaling = _get_scaling(total_size, data.shape, data.ndim) + + @property + def init_value(self): + """Convenience attribute to return tag.test_value""" + return self.tag.test_value + + +class MultiObservedRV(Factor): + """Observed random variable that a model is specified in terms of. + Potentially partially observed. 
+ """ + + def __init__(self, name, data, distribution, total_size=None, model=None): + """ + Parameters + ---------- + type: aesara type (optional) + owner: aesara owner (optional) + name: str + distribution: Distribution + model: Model + total_size: scalar Tensor (optional) + needed for upscaling logp + """ + self.name = name + self.data = { + name: aet.as_tensor(data, name, model, distribution) for name, data in data.items() + } + + self.missing_values = [ + datum.missing_values for datum in self.data.values() if datum.missing_values is not None + ] + self.logp_elemwiset = distribution.logp(**self.data) + # The logp might need scaling in minibatches. + # This is done in `Factor`. + self.logp_sum_unscaledt = distribution.logp_sum(**self.data) + self.logp_nojac_unscaledt = distribution.logp_nojac(**self.data) + self.total_size = total_size + self.model = model + self.distribution = distribution + self.scaling = _get_scaling(total_size, self.logp_elemwiset.shape, self.logp_elemwiset.ndim) + + # Make hashable by id for draw_values + def __hash__(self): + return id(self) + + def __eq__(self, other): + "Use object identity for MultiObservedRV equality." + # This is likely a Bad Thing, but changing it would break a lot of code. + return self is other + + def __ne__(self, other): + return not self == other + + +def _walk_up_rv(rv, formatting="plain"): + """Walk up aesara graph to get inputs for deterministic RV.""" + all_rvs = [] + parents = list(itertools.chain(*[j.inputs for j in rv.get_parents()])) + if parents: + for parent in parents: + all_rvs.extend(_walk_up_rv(parent, formatting=formatting)) + else: + name = rv.name if rv.name else "Constant" + fmt = r"\text{{{name}}}" if "latex" in formatting else "{name}" + all_rvs.append(fmt.format(name=name)) + return all_rvs + + +class DeterministicWrapper(TensorVariable): + def _str_repr(self, formatting="plain"): + if "latex" in formatting: + if formatting == "latex_with_params": + return r"$\text{{{name}}} \sim \text{{Deterministic}}({args})$".format( + name=self.name, args=r",~".join(_walk_up_rv(self, formatting=formatting)) + ) + return fr"$\text{{{self.name}}} \sim \text{{Deterministic}}$" + else: + if formatting == "plain_with_params": + args = ", ".join(_walk_up_rv(self, formatting=formatting)) + return f"{self.name} ~ Deterministic({args})" + return f"{self.name} ~ Deterministic" + + def _repr_latex_(self, *, formatting="latex_with_params", **kwargs): + return self._str_repr(formatting=formatting) + + __latex__ = _repr_latex_ + + def __str__(self): + return self._str_repr(formatting="plain") def Deterministic(name, var, model=None, dims=None): @@ -1561,6 +1742,90 @@ def Potential(name, var, model=None): model = modelcontext(model) var.name = model.name_for(name) var.tag.scaling = None + var.tag.transform = None model.potentials.append(var) model.add_random_variable(var) return var + + +class TransformedRV(PyMC3Variable): + """ + Parameters + ---------- + + type: aesara type (optional) + owner: aesara owner (optional) + name: str + distribution: Distribution + model: Model + total_size: scalar Tensor (optional) + needed for upscaling logp + """ + + def __init__( + self, + type=None, + owner=None, + index=None, + name=None, + distribution=None, + model=None, + transform=None, + total_size=None, + ): + if type is None: + type = distribution.type + super().__init__(type, owner, index, name) + + self.transformation = transform + + if distribution is not None: + self.model = model + self.distribution = distribution + self.dshape = 
tuple(distribution.shape) + self.dsize = int(np.prod(distribution.shape)) + + transformed_name = get_transformed_name(name, transform) + + self.transformed = model.Var( + transformed_name, transform.apply(distribution), total_size=total_size + ) + + normalRV = transform.backward(self.transformed) + + Apply(aesara.compile.view_op, inputs=[normalRV], outputs=[self]) + self.tag.test_value = normalRV.tag.test_value + self.scaling = _get_scaling(total_size, self.shape, self.ndim) + incorporate_methods( + source=distribution, + destination=self, + methods=["random"], + wrapper=InstanceMethod, + ) + + @property + def init_value(self): + """Convenience attribute to return tag.test_value""" + return self.tag.test_value + + +def as_iterargs(data): + if isinstance(data, tuple): + return data + else: + return [data] + + +def all_continuous(vars): + """Check that vars not include discrete variables or BART variables, excepting ObservedRVs.""" + + vars_ = [var for var in vars if not (var.owner and isinstance(var.owner.op, Observed))] + if any( + [ + (var.dtype in pm.discrete_types or (var.owner and isinstance(var.owner.op, pm.BART))) + for var in vars_ + ] + ): + return False + else: + return True diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 4f9948f746..46a434c22f 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -37,8 +37,7 @@ import pymc3 as pm -from pymc3.aesaraf import change_rv_size, inputvars, walk_model -from pymc3.backends.arviz import _DefaultTrace +from pymc3.aesaraf import inputvars from pymc3.backends.base import BaseTrace, MultiTrace from pymc3.backends.ndarray import NDArray from pymc3.blocking import DictToArrayBijection @@ -201,8 +200,8 @@ def assign_step_methods(model, step=None, methods=STEP_METHODS, step_kwargs=None has_gradient = var.dtype not in discrete_types if has_gradient: try: - tg.grad(model.logpt, var) - except (NotImplementedError, tg.NullTypeGradError): + tg.grad(model.logpt, var.tag.value_var) + except (AttributeError, NotImplementedError, tg.NullTypeGradError): has_gradient = False # select the best method rv_var = model.values_to_rvs[var] @@ -659,7 +658,9 @@ def sample( idata = None if compute_convergence_checks or return_inferencedata: - ikwargs = dict(model=model, save_warmup=not discard_tuned_samples) + # XXX: Arviz `log_likelihood` calculations need to be disabled until + # it's updated to work with v4. 
+ ikwargs = dict(model=model, save_warmup=not discard_tuned_samples, log_likelihood=False) if idata_kwargs: ikwargs.update(idata_kwargs) idata = pm.to_inference_data(trace, **ikwargs) @@ -1962,13 +1963,12 @@ def sample_prior_predictive( vars_ = set(var_names) if random_seed is not None: - # np.random.seed(random_seed) - model.default_rng.get_value(borrow=True).seed(random_seed) + np.random.seed(random_seed) names = get_default_varnames(vars_, include_transformed=False) vars_to_sample = [model[name] for name in names] - inputs = [i for i in inputvars(vars_to_sample) if not isinstance(i, SharedVariable)] + inputs = [i for i in inputvars(vars_to_sample)] sampler_fn = aesara.function( inputs, vars_to_sample, diff --git a/pymc3/step_methods/gibbs.py b/pymc3/step_methods/gibbs.py index 14fb6eaa18..cde14c9916 100644 --- a/pymc3/step_methods/gibbs.py +++ b/pymc3/step_methods/gibbs.py @@ -81,7 +81,7 @@ def elemwise_logp(model, var): v_logp = logpt(v) if var in graph_inputs([v_logp]): terms.append(v_logp) - return model.fn(at.add(*terms)) + return model.fn(add(*terms)) def categorical(prob, shape): diff --git a/pymc3/step_methods/hmc/base_hmc.py b/pymc3/step_methods/hmc/base_hmc.py index 89f74ad07e..21a93c15bf 100644 --- a/pymc3/step_methods/hmc/base_hmc.py +++ b/pymc3/step_methods/hmc/base_hmc.py @@ -85,8 +85,6 @@ def __init__( if vars is None: vars = self._model.cont_vars - # vars = inputvars(vars) - super().__init__(vars, blocked=blocked, model=self._model, dtype=dtype, **aesara_kwargs) self.adapt_step_size = adapt_step_size @@ -95,9 +93,9 @@ def __init__( # We're using the initial/test point to determine the (initial) step # size. - # XXX: If the dimensions of these terms change, the step size + # TODO: If the dimensions of these terms change, the step size # dimension-scaling should change as well, no? 
- test_point = self._model.initial_point + test_point = self._model.test_point continuous_vars = [test_point[v.name] for v in self._model.cont_vars] size = sum(v.size for v in continuous_vars) diff --git a/pymc3/tests/sampler_fixtures.py b/pymc3/tests/sampler_fixtures.py index 814ed616b7..6fb831d159 100644 --- a/pymc3/tests/sampler_fixtures.py +++ b/pymc3/tests/sampler_fixtures.py @@ -151,7 +151,7 @@ def setup_class(cls): ) cls.samples = {} for var in cls.model.unobserved_RVs: - cls.samples[get_var_name(var)] = cls.trace.get_values(var, burn=cls.burn) + cls.samples[get_var_name(var)] = cls.trace.get_values(var.tag.value_var, burn=cls.burn) def test_neff(self): if hasattr(self, "min_n_eff"): diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index a38c827b76..2e7c8b18aa 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -32,10 +32,8 @@ import pymc3 as pm from pymc3 import Deterministic, Potential -from pymc3.blocking import DictToArrayBijection, RaveledVars -from pymc3.distributions import Normal, logpt_sum, transforms -from pymc3.model import Point, ValueGradFunction -from pymc3.tests.helpers import SeededTest +from pymc3.distributions import Normal, transforms +from pymc3.model import ValueGradFunction class NewModel(pm.Model): @@ -204,7 +202,15 @@ def test_empty_observed(): data.values[:] = np.nan with pm.Model(): a = pm.Normal("a", observed=data) - assert not hasattr(a.tag, "observations") + # The masked observations are replaced by elements of the RV `a`, + # which means that they should all have the same sample test values + a_data = a.owner.inputs[1] + npt.assert_allclose(a.tag.test_value, a_data.tag.test_value) + + # Let's try this again with another distribution + b = pm.Gamma("b", alpha=1, beta=1, observed=data) + b_data = b.owner.inputs[1] + npt.assert_allclose(b.tag.test_value, b_data.tag.test_value) class TestValueGradFunction(unittest.TestCase): @@ -273,7 +279,21 @@ def test_grad(self): assert val == 21 npt.assert_allclose(grad, [5, 5, 5, 1, 1, 1, 1, 1, 1]) - @pytest.mark.xfail(reason="Lognormal not refactored for v4") + def test_bij(self): + self.f_grad.set_extra_values({"extra1": 5}) + array = np.ones(self.f_grad.size, dtype=self.f_grad.dtype) + point = self.f_grad.array_to_dict(array) + assert len(point) == 2 + npt.assert_allclose(point["val1"], 1) + npt.assert_allclose(point["val2"], 1) + + array2 = self.f_grad.dict_to_array(point) + npt.assert_allclose(array2, array) + point_ = self.f_grad.array_to_full_dict(array) + assert len(point_) == 3 + assert point_["extra1"] == 5 + + @pytest.mark.xfail(reason="Missing distributions") def test_edge_case(self): # Edge case discovered in #2948 ndim = 3 @@ -292,8 +312,9 @@ def test_edge_case(self): assert dlogp.size == 4 npt.assert_allclose(dlogp, 0.0, atol=1e-5) - def test_missing_data(self): - # Originally from a case described in #3122 + @pytest.mark.xfail(reason="Missing distributions") + def test_tensor_type_conversion(self): + # case described in #3122 X = np.random.binomial(1, 0.5, 10) X[0] = -1 # masked a single value X = np.ma.masked_values(X, value=-1) @@ -306,14 +327,8 @@ def test_missing_data(self): m.default_rng.get_value(borrow=True).seed(102) - # The gradient should have random values as inputs, so its value should - # change every time we evaluate it at the same point - # - # TODO: We could probably use a better test than this. 
- res = [gf(DictToArrayBijection.map(Point(m.test_point, model=m))) for i in range(20)] - assert np.var(res) > 0.0 - - def test_aesara_switch_broadcast_edge_cases_1(self): + @pytest.mark.xfail(reason="Missing distributions") + def test_aesara_switch_broadcast_edge_cases(self): # Tests against two subtle issues related to a previous bug in Theano # where `tt.switch` would not always broadcast tensors with single # values https://github.com/pymc-devs/aesara/issues/270 @@ -345,7 +360,7 @@ def test_aesara_switch_broadcast_edge_cases_2(self): npt.assert_allclose(m.dlogp([mu])({"mu": 0}), 2.499424682024436, rtol=1e-5) -@pytest.mark.xfail(reason="DensityDist not refactored for v4") +@pytest.mark.xfail(reason="DensityDist not supported") def test_multiple_observed_rv(): "Test previously buggy multi-observed RV comparison code." y1_data = np.random.randn(10) @@ -361,6 +376,7 @@ def test_multiple_observed_rv(): assert not model["x"] in model.value_vars +@pytest.mark.xfail(reason="Functions depend on deprecated dshape/dsize") def test_tempered_logp_dlogp(): with pm.Model() as model: pm.Normal("x") diff --git a/pymc3/tests/test_model_helpers.py b/pymc3/tests/test_model_helpers.py new file mode 100644 index 0000000000..00a2d0b477 --- /dev/null +++ b/pymc3/tests/test_model_helpers.py @@ -0,0 +1,157 @@ +# Copyright 2020 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import aesara +import aesara.sparse as sparse +import aesara.tensor as aet +import numpy as np +import numpy.ma as ma +import numpy.testing as npt +import pandas as pd +import pytest +import scipy.sparse as sps + +from aesara.graph.basic import Variable +from aesara.tensor.var import TensorConstant, TensorVariable + +import pymc3 as pm + + +class TestHelperFunc: + @pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"]) + def test_pandas_to_array(self, input_dtype): + """ + Ensure that pandas_to_array returns the dense array, masked array, + graph variable, TensorVariable, or sparse matrix as appropriate. + """ + # Create the various inputs to the function + sparse_input = sps.csr_matrix(np.eye(3)).astype(input_dtype) + dense_input = np.arange(9).reshape((3, 3)).astype(input_dtype) + + input_name = "input_variable" + aesara_graph_input = aet.as_tensor(dense_input, name=input_name) + pandas_input = pd.DataFrame(dense_input) + + # All the even numbers are replaced with NaN + missing_numpy_input = np.array([[np.nan, 1, np.nan], [3, np.nan, 5], [np.nan, 7, np.nan]]) + missing_pandas_input = pd.DataFrame(missing_numpy_input) + masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0)) + + # Create a generator object. Apparently the generator object needs to + # yield numpy arrays. 
+ square_generator = (np.array([i ** 2], dtype=int) for i in range(100)) + + # Alias the function to be tested + func = pm.model.pandas_to_array + + ##### + # Perform the various tests + ##### + # Check function behavior with dense arrays and pandas dataframes + # without missing values + for input_value in [dense_input, pandas_input]: + func_output = func(input_value) + assert isinstance(func_output, np.ndarray) + assert func_output.shape == input_value.shape + npt.assert_allclose(func_output, dense_input) + + # Check function behavior with sparse matrix inputs + sparse_output = func(sparse_input) + assert sps.issparse(sparse_output) + assert sparse_output.shape == sparse_input.shape + npt.assert_allclose(sparse_output.toarray(), sparse_input.toarray()) + + # Check function behavior when using masked array inputs and pandas + # objects with missing data + for input_value in [missing_numpy_input, masked_array_input, missing_pandas_input]: + func_output = func(input_value) + assert isinstance(func_output, ma.core.MaskedArray) + assert func_output.shape == input_value.shape + npt.assert_allclose(func_output, masked_array_input) + + # Check function behavior with Aesara graph variable + aesara_output = func(aesara_graph_input) + assert isinstance(aesara_output, Variable) + npt.assert_allclose(aesara_output.eval(), aesara_graph_input.eval()) + intX = pm.aesaraf._conversion_map[aesara.config.floatX] + if dense_input.dtype == intX or dense_input.dtype == aesara.config.floatX: + assert aesara_output.owner is None # func should not have added new nodes + assert aesara_output.name == input_name + else: + assert aesara_output.owner is not None # func should have casted + assert aesara_output.owner.inputs[0].name == input_name + + if "float" in input_dtype: + assert aesara_output.dtype == aesara.config.floatX + else: + assert aesara_output.dtype == intX + + # Check function behavior with generator data + generator_output = func(square_generator) + + # Output is wrapped with `pm.floatX`, and this unwraps + wrapped = generator_output.owner.inputs[0] + # Make sure the returned object has .set_gen and .set_default methods + assert hasattr(wrapped, "set_gen") + assert hasattr(wrapped, "set_default") + # Make sure the returned object is a Aesara TensorVariable + assert isinstance(wrapped, TensorVariable) + + def test_as_tensor(self): + """ + Check returned values for `data` given known inputs to `as_tensor()`. + + Note that ndarrays should return a TensorConstant and sparse inputs + should return a Sparse Aesara object. 
+ """ + # Create the various inputs to the function + input_name = "testing_inputs" + sparse_input = sps.csr_matrix(np.eye(3)) + dense_input = np.arange(9).reshape((3, 3)) + masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0)) + + # Create a fake model and fake distribution to be used for the test + fake_model = pm.Model() + with fake_model: + fake_distribution = pm.Normal.dist(mu=0, sigma=1) + # Create the testval attribute simply for the sake of model testing + fake_distribution.testval = None + + # Alias the function to be tested + func = pm.model.make_obs_var + + # Check function behavior using the various inputs + dense_output = func(dense_input, input_name, fake_model, fake_distribution) + sparse_output = func(sparse_input, input_name, fake_model, fake_distribution) + masked_output = func(masked_array_input, input_name, fake_model, fake_distribution) + + # Ensure that the missing values are appropriately set to None + for func_output in [dense_output, sparse_output]: + assert func_output.missing_values is None + + # Ensure that the Aesara variable names are correctly set. + # Note that the output for masked inputs do not have their names set + # to the passed value. + for func_output in [dense_output, sparse_output]: + assert func_output.name == input_name + + # Ensure the that returned functions are all of the correct type + assert isinstance(dense_output, TensorConstant) + assert sparse.basic._is_sparse_variable(sparse_output) + + # Masked output is something weird. Just ensure it has missing values + # self.assertIsInstance(masked_output, TensorConstant) + assert masked_output.missing_values is not None + + return None diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py index be1da625a2..0b70716fc5 100644 --- a/pymc3/tuning/starting.py +++ b/pymc3/tuning/starting.py @@ -154,15 +154,10 @@ def dlogp_func(x): cost_func.progress.update(last_v) print() - mx0 = RaveledVars(mx0, x0.point_map_info) - vars = get_default_varnames( [v.tag.value_var for v in model.unobserved_RVs], include_transformed ) - mx = { - var.name: value - for var, value in zip(vars, model.fastfn(vars)(DictToArrayBijection.rmap(mx0))) - } + mx = {var.name: value for var, value in zip(vars, model.fastfn(vars)(bij.rmap(mx0)))} if return_raw: return mx, opt_result diff --git a/pymc3/util.py b/pymc3/util.py index d60f83caff..4db6041195 100644 --- a/pymc3/util.py +++ b/pymc3/util.py @@ -257,8 +257,62 @@ def get_repr_for_variable(variable, formatting="plain"): def get_var_name(var): - """Get an appropriate, plain variable name for a variable.""" - return getattr(var, "name", str(var)) + """Get an appropriate, plain variable name for a variable. Necessary + because we override aesara.tensor.var.TensorVariable.__str__ to give informative + string representations to our pymc3.PyMC3Variables, yet we want to use the + plain name as e.g. keys in dicts. + """ + if isinstance(var, TensorVariable): + return super(TensorVariable, var).__str__() + else: + return str(var) + + +def update_start_vals(a, b, model): + r"""Update a with b, without overwriting existing keys.""" + a.update({k: v for k, v in b.items() if k not in a}) + + +def check_start_vals(start, model): + r"""Check that the starting values for MCMC do not cause the relevant log probability + to evaluate to something invalid (e.g. 
Inf or NaN) + + Parameters + ---------- + start : dict, or array of dict + Starting point in parameter space (or partial point) + Defaults to ``trace.point(-1))`` if there is a trace provided and model.test_point if not + (defaults to empty dict). Initialization methods for NUTS (see ``init`` keyword) can + overwrite the default. + model : Model object + Raises + ______ + KeyError if the parameters provided by `start` do not agree with the parameters contained + within `model` + pymc3.exceptions.SamplingError if the evaluation of the parameters in `start` leads to an + invalid (i.e. non-finite) state + Returns + ------- + None + """ + start_points = [start] if isinstance(start, dict) else start + for elem in start_points: + if not set(elem.keys()).issubset(model.named_vars.keys()): + extra_keys = ", ".join(set(elem.keys()) - set(model.named_vars.keys())) + valid_keys = ", ".join(model.named_vars.keys()) + raise KeyError( + "Some start parameters do not appear in the model!\n" + "Valid keys are: {}, but {} was supplied".format(valid_keys, extra_keys) + ) + + initial_eval = model.check_test_point(test_point=elem) + + if not np.all(np.isfinite(initial_eval)): + raise SamplingError( + "Initial evaluation of model at starting point failed!\n" + "Starting values:\n{}\n\n" + "Initial evaluation results:\n{}".format(elem, str(initial_eval)) + ) def get_transformed(z): From 02e25aa3ff48451ca6665786befb0fccea9f51c4 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Fri, 29 Jan 2021 00:16:32 -0600 Subject: [PATCH 02/44] Remove shape dependencies from DictToArrayBijection This commit changes `DictToArrayBijection` so that it returns a `RaveledVars` datatype that contains the original raveled and concatenated vector along with the information needed to revert it back to dictionay/variables form. Simply put, the variables-to-single-vector mapping steps have been pushed away from the model object and its symbolic terms and closer to the (sampling) processes that produce and work with `ndarray` values for said terms. In doing so, we can operate under fewer unnecessarily strong assumptions (e.g. that the shapes of each term are static and equal to the initial test points), and let the sampling processes that require vector-only steps deal with any changes in the mappings. 
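Roughly, the mapping and its inverse now behave like the following minimal
sketch (the helper names are illustrative stand-ins for
`DictToArrayBijection.map` and `DictToArrayBijection.rmap`, and details such
as empty inputs or shared variables are omitted):

    import collections

    import numpy as np

    # `point_map_info` stores `(name, shape, dtype)` for each raveled variable.
    RaveledVars = collections.namedtuple("RaveledVars", "data, point_map_info")

    def map_point(var_dict):
        # Ravel and concatenate every value, keeping the metadata needed to invert.
        vars_info = tuple((v, k, v.shape, v.dtype) for k, v in var_dict.items())
        data = np.concatenate([v[0].ravel() for v in vars_info])
        return RaveledVars(data, tuple(v[1:] for v in vars_info))

    def rmap_point(raveled_vars):
        # Rebuild the dictionary from the stored name/shape/dtype tuples.
        point = {}
        last_idx = 0
        for name, shape, dtype in raveled_vars.point_map_info:
            size = int(np.prod(shape))
            values = raveled_vars.data[last_idx : last_idx + size]
            point[name] = values.reshape(shape).astype(dtype)
            last_idx += size
        return point

    point = {"mu": np.zeros((2, 3)), "sigma_log__": np.ones(4)}
    raveled = map_point(point)
    assert raveled.data.shape == (10,)
    assert rmap_point(raveled)["mu"].shape == (2, 3)

Because the shape/dtype metadata travels with the raveled array, samplers can
round-trip between dictionary points and flat vectors without consulting the
model for static shape information.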
--- pymc3/aesaraf.py | 23 +++---- pymc3/blocking.py | 23 +++++-- pymc3/distributions/discrete.py | 2 +- pymc3/model.py | 40 +++++------ pymc3/sampling.py | 24 ++++--- pymc3/step_methods/arraystep.py | 88 ++++++++++--------------- pymc3/step_methods/hmc/base_hmc.py | 4 +- pymc3/step_methods/hmc/quadpotential.py | 8 +-- pymc3/step_methods/metropolis.py | 5 +- pymc3/step_methods/mlda.py | 5 +- pymc3/tests/test_distributions.py | 2 +- pymc3/tests/test_model.py | 19 +----- pymc3/tuning/starting.py | 19 ++---- pymc3/variational/opvi.py | 3 +- 14 files changed, 113 insertions(+), 152 deletions(-) diff --git a/pymc3/aesaraf.py b/pymc3/aesaraf.py index 54319005df..77b1c0ec7b 100644 --- a/pymc3/aesaraf.py +++ b/pymc3/aesaraf.py @@ -47,12 +47,8 @@ from aesara.tensor.subtensor import AdvancedIncSubtensor, AdvancedIncSubtensor1 from aesara.tensor.var import TensorVariable -from pymc3.vartypes import continuous_types, int_types, isgenerator, typefilter - -PotentialShapeType = Union[ - int, np.ndarray, Tuple[Union[int, Variable], ...], List[Union[int, Variable]], Variable -] - +from pymc3.data import GeneratorAdapter +from pymc3.vartypes import continuous_types, int_types, typefilter __all__ = [ "gradient", @@ -576,20 +572,17 @@ def join_nonshared_inputs( tensor_type = joined.type inarray = tensor_type("inarray") else: - if point is None: - raise ValueError("A point is required when `make_shared` is True") - joined_values = np.concatenate([point[var.name].ravel() for var in vars]) - inarray = aesara.shared(joined_values, "inarray") + inarray = aesara.shared(joined.tag.test_value, "inarray") - if aesara.config.compute_test_value != "off": - inarray.tag.test_value = joined.tag.test_value + inarray.tag.test_value = joined.tag.test_value replace = {} last_idx = 0 for var in vars: - shape = point[var.name].shape - arr_len = np.prod(shape, dtype=int) - replace[var] = reshape_t(inarray[last_idx : last_idx + arr_len], shape).astype(var.dtype) + arr_len = aet.prod(var.shape) + replace[var] = reshape_t(inarray[last_idx : last_idx + arr_len], var.shape).astype( + var.dtype + ) last_idx += arr_len replace.update(shared) diff --git a/pymc3/blocking.py b/pymc3/blocking.py index 16bb59ec3e..ec75d81082 100644 --- a/pymc3/blocking.py +++ b/pymc3/blocking.py @@ -23,11 +23,26 @@ import numpy as np -__all__ = ["DictToArrayBijection"] +__all__ = ["ArrayOrdering", "DictToArrayBijection"] # `point_map_info` is a tuple of tuples containing `(name, shape, dtype)` for # each of the raveled variables. 
RaveledVars = collections.namedtuple("RaveledVars", "data, point_map_info") +VarMap = collections.namedtuple("VarMap", "var, slc, shp, dtyp") +DataMap = collections.namedtuple("DataMap", "list_ind, slc, shp, dtype, name") + + +class ArrayOrdering: + """ + + slc = slice(self.size, self.size + var.dsize) + varmap = VarMap(name, slc, var.dshape, var.dtype) + self.vmap.append(varmap) + self.by_name[name] = varmap + self.size += var.dsize + + def __getitem__(self, key): + return self.by_name[key] class DictToArrayBijection: @@ -41,11 +56,7 @@ class DictToArrayBijection: def map(var_dict: Dict[str, np.ndarray]) -> RaveledVars: """Map a dictionary of names and variables to a concatenated 1D array space.""" vars_info = tuple((v, k, v.shape, v.dtype) for k, v in var_dict.items()) - raveled_vars = [v[0].ravel() for v in vars_info] - if raveled_vars: - res = np.concatenate(raveled_vars) - else: - res = np.array([]) + res = np.concatenate([v[0].ravel() for v in vars_info]) return RaveledVars(res, tuple(v[1:] for v in vars_info)) @staticmethod diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 0fa23ae6ce..5c941d58a6 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -1261,7 +1261,7 @@ def dist(cls, p, **kwargs): @_logp.register(CategoricalRV) -def categorical_logp(op, value, p_, upper): +def categorical_logp(op, value, p, upper): r""" Calculate log-probability of Categorical distribution at specified value. diff --git a/pymc3/model.py b/pymc3/model.py index 8d771afc68..1171ee29df 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -38,13 +38,13 @@ import pymc3 as pm from pymc3.aesaraf import generator, gradient, hessian, inputvars -from pymc3.blocking import ArrayOrdering, DictToArrayBijection +from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.data import GenTensorVariable, Minibatch from pymc3.distributions import _get_scaling, change_rv_size, logpt, logpt_sum from pymc3.exceptions import ImputationWarning from pymc3.math import flatten_list -from pymc3.util import UNSET, WithMemoization, get_var_name, treedict, treelist -from pymc3.vartypes import continuous_types, discrete_types, typefilter +from pymc3.util import WithMemoization, get_transformed_name, get_var_name +from pymc3.vartypes import continuous_types, discrete_types, isgenerator, typefilter __all__ = [ "Model", @@ -423,17 +423,11 @@ def __init__( self._extra_vars_shared = {} for var in extra_vars: shared = aesara.shared(var.tag.test_value, var.name + "_shared__") - # test TensorType compatibility - if hasattr(var.tag.test_value, "shape"): - testtype = TensorType(var.dtype, [s == 1 for s in var.tag.test_value.shape]) - - if testtype != shared.type: - shared.type = testtype self._extra_vars_shared[var.name] = shared givens.append((var, shared)) if compute_grads: - grads = grad(cost, grad_vars, disconnected_inputs="ignore") + grads = grad(cost, grad_vars) for grad_wrt, var in zip(grads, grad_vars): grad_wrt.name = f"{var.name}_grad" outputs = [cost] + grads @@ -654,19 +648,12 @@ def isroot(self): return self.parent is None @property - def ndim(self): - return sum(var.ndim for var in self.free_RVs) - - @property - def logp_array(self): - return self.bijection.mapf(self.fastlogp) + def size(self): + return sum(self.test_point[n.name].size for n in self.free_RVs) @property - def dlogp_array(self): - logpt = self.logpt - vars = inputvars(logpt) - dlogp = self.fastfn(gradient(self.logpt, vars)) - return self.bijection.mapf(dlogp) + def ndim(self): + return 
sum(var.ndim for var in self.free_RVs) def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): """Compile a aesara function that computes logp and gradient. @@ -1094,8 +1081,10 @@ def flatten(self, vars=None, order=None, inputvar=None): """ if vars is None: vars = self.vars - if order is None: - order = ArrayOrdering(vars) + if order is not None: + var_map = {v.name: v for v in vars} + vars = [var_map[n] for n in order] + if inputvar is None: inputvar = at.vector("flat_view", dtype=aesara.config.floatX) if aesara.config.compute_test_value != "off": @@ -1107,13 +1096,14 @@ def flatten(self, vars=None, order=None, inputvar=None): replacements = {} last_idx = 0 for var in vars: - arr_len = at.prod(var.shape, dtype="int64") + arr_len = aet.prod(var.shape, dtype="int64") replacements[self.named_vars[var.name]] = ( inputvar[last_idx : (last_idx + arr_len)].reshape(var.shape).astype(var.dtype) ) last_idx += arr_len - flat_view = FlatView(inputvar, replacements) + view = {vm.var: vm for vm in order.vmap} + flat_view = FlatView(inputvar, replacements, view) return flat_view diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 46a434c22f..ed4cb7838f 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -41,6 +41,8 @@ from pymc3.backends.base import BaseTrace, MultiTrace from pymc3.backends.ndarray import NDArray from pymc3.blocking import DictToArrayBijection +from pymc3.distributions.distribution import draw_values +from pymc3.distributions.posterior_predictive import fast_sample_posterior_predictive from pymc3.exceptions import IncorrectArgumentsError, SamplingError from pymc3.model import Model, Point, modelcontext from pymc3.parallel_sampling import Draw, _cpu_count @@ -2120,16 +2122,16 @@ def init_nuts( pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff="relative"), ] - apoint = DictToArrayBijection.map(model.initial_point) + apoint = DictToArrayBijection.map(model.test_point) if init == "adapt_diag": - start = [model.initial_point] * chains + start = [model.test_point] * chains mean = np.mean([apoint.data] * chains, axis=0) var = np.ones_like(mean) n = len(var) potential = quadpotential.QuadPotentialDiagAdapt(n, mean, var, 10) elif init == "jitter+adapt_diag": - start = _init_jitter(model, model.initial_point, chains, jitter_max_retries) + start = _init_jitter(model, chains, jitter_max_retries) mean = np.mean([DictToArrayBijection.map(vals).data for vals in start], axis=0) var = np.ones_like(mean) n = len(var) @@ -2205,19 +2207,15 @@ def init_nuts( start = [start] * chains potential = quadpotential.QuadPotentialFull(cov) elif init == "adapt_full": - initial_point = model.initial_point - start = [initial_point] * chains + start = [model.test_point] * chains mean = np.mean([apoint.data] * chains, axis=0) - initial_point_model_size = sum(initial_point[n.name].size for n in model.value_vars) - cov = np.eye(initial_point_model_size) - potential = quadpotential.QuadPotentialFullAdapt(initial_point_model_size, mean, cov, 10) + cov = np.eye(model.size) + potential = quadpotential.QuadPotentialFullAdapt(model.size, mean, cov, 10) elif init == "jitter+adapt_full": - initial_point = model.initial_point - start = _init_jitter(model, initial_point, chains, jitter_max_retries) + start = _init_jitter(model, chains, jitter_max_retries) mean = np.mean([DictToArrayBijection.map(vals).data for vals in start], axis=0) - initial_point_model_size = sum(initial_point[n.name].size for n in model.value_vars) - cov = np.eye(initial_point_model_size) - potential = 
quadpotential.QuadPotentialFullAdapt(initial_point_model_size, mean, cov, 10) + cov = np.eye(model.size) + potential = quadpotential.QuadPotentialFullAdapt(model.size, mean, cov, 10) else: raise ValueError(f"Unknown initializer: {init}.") diff --git a/pymc3/step_methods/arraystep.py b/pymc3/step_methods/arraystep.py index bd02887cd8..3224380a9f 100644 --- a/pymc3/step_methods/arraystep.py +++ b/pymc3/step_methods/arraystep.py @@ -21,7 +21,7 @@ from numpy.random import uniform from pymc3.blocking import DictToArrayBijection, RaveledVars -from pymc3.model import modelcontext +from pymc3.model import PyMC3Variable, modelcontext from pymc3.step_methods.compound import CompoundStep from pymc3.util import get_var_name @@ -69,8 +69,8 @@ def __new__(cls, *args, **kwargs): else: # Assume all model variables vars = model.value_vars - if not isinstance(vars, (tuple, list)): - vars = [vars] + # get the actual inputs from the vars + # vars = inputvars(vars) if len(vars) == 0: raise ValueError("No free random variables to sample.") @@ -149,22 +149,13 @@ def step(self, point: Dict[str, np.ndarray]): step_res = self.astep(apoint, *inputs) if self.generates_stats: - apoint_new, stats = step_res + apoint, stats = self.astep(DictToArrayBijection.map(point), *inputs) + return DictToArrayBijection.rmap(apoint), stats else: - apoint_new = step_res + apoint = self.astep(DictToArrayBijection.map(point), *inputs) + return DictToArrayBijection.rmap(apoint) - if not isinstance(apoint_new, RaveledVars): - # We assume that the mapping has stayed the same - apoint_new = RaveledVars(apoint_new, apoint.point_map_info) - - point_new = DictToArrayBijection.rmap(apoint_new) - - if self.generates_stats: - return point_new, stats - - return point_new - - def astep(self, apoint: RaveledVars, point: Dict[str, np.ndarray]): + def astep(self, apoint, point): raise NotImplementedError() @@ -190,42 +181,18 @@ def __init__(self, vars, shared, blocked=True): def step(self, point): - # Remove shared variables from the sample point - point_no_shared = point.copy() - for name, shared_var in self.shared.items(): - shared_var.set_value(point[name]) - if name in point_no_shared: - del point_no_shared[name] - - q = DictToArrayBijection.map(point_no_shared) - - step_res = self.astep(q) - if self.generates_stats: - apoint, stats = step_res + apoint, stats = self.astep(DictToArrayBijection.map(point)) + return DictToArrayBijection.rmap(apoint), stats else: - apoint = step_res - - if not isinstance(apoint, RaveledVars): - # We assume that the mapping has stayed the same - apoint = RaveledVars(apoint, q.point_map_info) - - # We need to re-add the shared variables to the new sample point - a_point = DictToArrayBijection.rmap(apoint) - new_point = {} - for name in point.keys(): - shared_value = self.shared.get(name, None) - if shared_value is not None: - new_point[name] = shared_value.get_value() - else: - new_point[name] = a_point[name] + array = DictToArrayBijection.map(point) + apoint = self.astep(array) + if not isinstance(apoint, RaveledVars): + # We assume that the mapping has stayed the same + apoint = RaveledVars(apoint, array.point_map_info) + return DictToArrayBijection.rmap(apoint) - if self.generates_stats: - return new_point, stats - - return new_point - - def astep(self, apoint: RaveledVars): + def astep(self, apoint): raise NotImplementedError() @@ -287,8 +254,25 @@ def __init__( super().__init__(vars, func._extra_vars_shared, blocked) def step(self, point): - self._logp_dlogp_func._extra_are_set = True - return super().step(point) 
+ self._logp_dlogp_func.set_extra_values(point) + + array = DictToArrayBijection.map(point) + + stats = None + if self.generates_stats: + apoint, stats = self.astep(array) + else: + apoint = self.astep(array) + + if not isinstance(apoint, RaveledVars): + # We assume that the mapping has stayed the same + apoint = RaveledVars(apoint, array.point_map_info) + + point = DictToArrayBijection.rmap(apoint) + + if stats is not None: + return point, stats + return point def astep(self, apoint): raise NotImplementedError() diff --git a/pymc3/step_methods/hmc/base_hmc.py b/pymc3/step_methods/hmc/base_hmc.py index 21a93c15bf..2de1b2bd1f 100644 --- a/pymc3/step_methods/hmc/base_hmc.py +++ b/pymc3/step_methods/hmc/base_hmc.py @@ -85,6 +85,8 @@ def __init__( if vars is None: vars = self._model.cont_vars + # vars = inputvars(vars) + super().__init__(vars, blocked=blocked, model=self._model, dtype=dtype, **aesara_kwargs) self.adapt_step_size = adapt_step_size @@ -93,7 +95,7 @@ def __init__( # We're using the initial/test point to determine the (initial) step # size. - # TODO: If the dimensions of these terms change, the step size + # XXX: If the dimensions of these terms change, the step size # dimension-scaling should change as well, no? test_point = self._model.test_point continuous_vars = [test_point[v.name] for v in self._model.cont_vars] diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 40f542a70f..541821b742 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -120,8 +120,8 @@ def raise_ok(self, map_info=None): Parameters ---------- - map_info: List of (name, shape, dtype) - List tuples with variable name, shape, and dtype. + vmap: list of blocking.VarMap + List of `VarMap`s, which are namedtuples with var, slc, shp, dtyp Raises ------ @@ -245,8 +245,8 @@ def raise_ok(self, map_info): Parameters ---------- - map_info: List of (name, shape, dtype) - List tuples with variable name, shape, and dtype. 
+ vmap: List of tuples (var, ) + List of `VarMap`s, which are namedtuples with var, slc, shp, dtyp Raises ------ diff --git a/pymc3/step_methods/metropolis.py b/pymc3/step_methods/metropolis.py index 94f2e345dc..901948600c 100644 --- a/pymc3/step_methods/metropolis.py +++ b/pymc3/step_methods/metropolis.py @@ -23,8 +23,9 @@ import pymc3 as pm -from pymc3.aesaraf import floatX, rvs_to_value_vars -from pymc3.blocking import DictToArrayBijection, RaveledVars +from pymc3.aesaraf import floatX +from pymc3.blocking import DictToArrayBijection +from pymc3.distributions import draw_values from pymc3.step_methods.arraystep import ( ArrayStep, ArrayStepShared, diff --git a/pymc3/step_methods/mlda.py b/pymc3/step_methods/mlda.py index a155993fef..77a9e76b84 100644 --- a/pymc3/step_methods/mlda.py +++ b/pymc3/step_methods/mlda.py @@ -26,7 +26,7 @@ import pymc3 as pm from pymc3.blocking import DictToArrayBijection -from pymc3.model import Model, Point +from pymc3.model import Model from pymc3.step_methods.arraystep import ArrayStepShared, Competence, metrop_select from pymc3.step_methods.compound import CompoundStep from pymc3.step_methods.metropolis import ( @@ -746,8 +746,7 @@ def astep(self, q0): # Call the recursive DA proposal to get proposed sample # and convert dict -> numpy array - pre_q = self.proposal_dist(q0_dict) - q = DictToArrayBijection.map(pre_q) + q = DictToArrayBijection.map(self.proposal_dist(q0_dict)) # Evaluate MLDA acceptance log-ratio # If proposed sample from lower levels is the same as current one, diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index dfb215aa53..aa838d18df 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -36,7 +36,7 @@ import pymc3 as pm -from pymc3.aesaraf import change_rv_size, floatX +from pymc3.aesaraf import floatX from pymc3.distributions import ( AR1, CAR, diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index 2e7c8b18aa..ff43746865 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -32,6 +32,7 @@ import pymc3 as pm from pymc3 import Deterministic, Potential +from pymc3.blocking import RaveledVars from pymc3.distributions import Normal, transforms from pymc3.model import ValueGradFunction @@ -217,7 +218,7 @@ class TestValueGradFunction(unittest.TestCase): def test_no_extra(self): a = at.vector("a") a.tag.test_value = np.zeros(3, dtype=a.dtype) - f_grad = ValueGradFunction([a.sum()], [a], {}, mode="FAST_COMPILE") + f_grad = ValueGradFunction([a.sum()], [a], [], mode="FAST_COMPILE") assert f_grad._extra_vars == [] def test_invalid_type(self): @@ -279,20 +280,6 @@ def test_grad(self): assert val == 21 npt.assert_allclose(grad, [5, 5, 5, 1, 1, 1, 1, 1, 1]) - def test_bij(self): - self.f_grad.set_extra_values({"extra1": 5}) - array = np.ones(self.f_grad.size, dtype=self.f_grad.dtype) - point = self.f_grad.array_to_dict(array) - assert len(point) == 2 - npt.assert_allclose(point["val1"], 1) - npt.assert_allclose(point["val2"], 1) - - array2 = self.f_grad.dict_to_array(point) - npt.assert_allclose(array2, array) - point_ = self.f_grad.array_to_full_dict(array) - assert len(point_) == 3 - assert point_["extra1"] == 5 - @pytest.mark.xfail(reason="Missing distributions") def test_edge_case(self): # Edge case discovered in #2948 @@ -376,7 +363,7 @@ def test_multiple_observed_rv(): assert not model["x"] in model.value_vars -@pytest.mark.xfail(reason="Functions depend on deprecated dshape/dsize") +# @pytest.mark.xfail(reason="Functions depend on 
deprecated dshape/dsize") def test_tempered_logp_dlogp(): with pm.Model() as model: pm.Normal("x") diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py index 0b70716fc5..893f11b6b4 100644 --- a/pymc3/tuning/starting.py +++ b/pymc3/tuning/starting.py @@ -29,7 +29,7 @@ import pymc3 as pm from pymc3.aesaraf import inputvars -from pymc3.blocking import DictToArrayBijection, RaveledVars +from pymc3.blocking import DictToArrayBijection from pymc3.model import Point, modelcontext from pymc3.util import get_default_varnames, get_var_name from pymc3.vartypes import discrete_types, typefilter @@ -102,22 +102,14 @@ def find_MAP( start = Point(start, model=model) + logp_func = DictToArrayBijection.mapf(model.fastlogp_nojac) x0 = DictToArrayBijection.map(start) - # TODO: If the mapping is fixed, we can simply create graphs for the - # mapping and avoid all this bijection overhead - def logp_func(x): - return DictToArrayBijection.mapf(model.fastlogp_nojac)(RaveledVars(x, x0.point_map_info)) - try: # This might be needed for calls to `dlogp_func` # start_map_info = tuple((v.name, v.shape, v.dtype) for v in vars) - def dlogp_func(x): - return DictToArrayBijection.mapf(model.fastdlogp_nojac(vars))( - RaveledVars(x, x0.point_map_info) - ) - + dlogp_func = DictToArrayBijection.mapf(model.fastdlogp_nojac(vars)) compute_gradient = True except (AttributeError, NotImplementedError, tg.NullTypeGradError): compute_gradient = False @@ -157,7 +149,10 @@ def dlogp_func(x): vars = get_default_varnames( [v.tag.value_var for v in model.unobserved_RVs], include_transformed ) - mx = {var.name: value for var, value in zip(vars, model.fastfn(vars)(bij.rmap(mx0)))} + mx = { + var.name: value + for var, value in zip(vars, model.fastfn(vars)(DictToArrayBijection.rmap(mx0))) + } if return_raw: return mx, opt_result diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index f2fe93530c..024b121a91 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -59,6 +59,7 @@ from pymc3.aesaraf import at_rng, identity from pymc3.backends import NDArray +from pymc3.blocking import ArrayOrdering, VarMap from pymc3.model import modelcontext from pymc3.util import ( WithMemoization, @@ -959,7 +960,7 @@ def __init_group__(self, group): self.group = [get_transformed(var) for var in self.group] # XXX: This needs to be refactored - # self.ordering = ArrayOrdering([]) + self.ordering = ArrayOrdering([]) self.replacements = dict() for var in self.group: if var.type.numpy_dtype.name in discrete_types: From d160316cc2c59bb4ded477dd3302ad554a6f7da5 Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Wed, 3 Feb 2021 19:34:58 -0600 Subject: [PATCH 03/44] Remove newly deprecated classes and functions Classes and functions removed: - PyMC3Variable - ObservedRV - FreeRV - MultiObservedRV - TransformedRV - ArrayOrdering - VarMap - DataMap - _DrawValuesContext - _DrawValuesContextBlocker - is_fast_drawable - _compile_theano_function - vectorize_theano_function - get_vectorize_signature - _draw_value - draw_values - generate_samples - fast_sample_posterior_predictive Modules removed: - pymc3.distributions.posterior_predictive - pymc3.tests.test_random --- pymc3/blocking.py | 17 +- pymc3/distributions/__init__.py | 33 ++- pymc3/distributions/continuous.py | 124 +++++++++- pymc3/distributions/discrete.py | 118 ++++++++- pymc3/distributions/distribution.py | 24 +- pymc3/distributions/multivariate.py | 120 ++++++++- pymc3/distributions/timeseries.py | 6 +- pymc3/distributions/transforms.py | 82 ++++++- pymc3/model.py | 287 ++++------------------ pymc3/model_graph.py | 6 +- pymc3/sampling.py | 3 +- pymc3/step_methods/arraystep.py | 2 +- pymc3/step_methods/gibbs.py | 3 - pymc3/step_methods/hmc/quadpotential.py | 8 +- pymc3/step_methods/metropolis.py | 21 +- pymc3/step_methods/sgmcmc.py | 4 +- pymc3/tests/test_distributions_random.py | 6 +- pymc3/tests/test_sampling.py | 25 +- pymc3/tests/test_variational_inference.py | 4 +- pymc3/util.py | 6 +- pymc3/variational/opvi.py | 3 +- 21 files changed, 508 insertions(+), 394 deletions(-) diff --git a/pymc3/blocking.py b/pymc3/blocking.py index ec75d81082..332edceed8 100644 --- a/pymc3/blocking.py +++ b/pymc3/blocking.py @@ -23,26 +23,11 @@ import numpy as np -__all__ = ["ArrayOrdering", "DictToArrayBijection"] +__all__ = ["DictToArrayBijection"] # `point_map_info` is a tuple of tuples containing `(name, shape, dtype)` for # each of the raveled variables. 
RaveledVars = collections.namedtuple("RaveledVars", "data, point_map_info") -VarMap = collections.namedtuple("VarMap", "var, slc, shp, dtyp") -DataMap = collections.namedtuple("DataMap", "list_ind, slc, shp, dtype, name") - - -class ArrayOrdering: - """ - - slc = slice(self.size, self.size + var.dsize) - varmap = VarMap(name, slc, var.dshape, var.dtype) - self.vmap.append(varmap) - self.by_name[name] = varmap - self.size += var.dsize - - def __getitem__(self, key): - return self.by_name[key] class DictToArrayBijection: diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index 648f35d392..bd4ab5f1c3 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -380,8 +380,7 @@ def logpt_sum(rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, return aet.sum(logpt(rv_var, rv_value, **kwargs)) -# from pymc3.distributions import timeseries -from pymc3.distributions import shape_utils, transforms +from pymc3.distributions import shape_utils, timeseries, transforms from pymc3.distributions.bart import BART from pymc3.distributions.bound import Bound from pymc3.distributions.continuous import ( @@ -443,8 +442,6 @@ def logpt_sum(rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, Discrete, Distribution, NoDistribution, - draw_values, - generate_samples, ) from pymc3.distributions.mixture import Mixture, MixtureSameFamily, NormalMixture from pymc3.distributions.multivariate import ( @@ -462,15 +459,15 @@ def logpt_sum(rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, WishartBartlett, ) from pymc3.distributions.simulator import Simulator +from pymc3.distributions.timeseries import ( + AR, + AR1, + GARCH11, + GaussianRandomWalk, + MvGaussianRandomWalk, + MvStudentTRandomWalk, +) -# from pymc3.distributions.timeseries import ( -# AR, -# AR1, -# GARCH11, -# GaussianRandomWalk, -# MvGaussianRandomWalk, -# MvStudentTRandomWalk, -# ) __all__ = [ "Uniform", "Flat", @@ -528,13 +525,13 @@ def logpt_sum(rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, "WishartBartlett", "LKJCholeskyCov", "LKJCorr", - # "AR1", - # "AR", + "AR1", + "AR", "AsymmetricLaplace", - # "GaussianRandomWalk", - # "MvGaussianRandomWalk", - # "MvStudentTRandomWalk", - # "GARCH11", + "GaussianRandomWalk", + "MvGaussianRandomWalk", + "MvStudentTRandomWalk", + "GARCH11", "SkewNormal", "Mixture", "NormalMixture", diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 23012dc9a1..535974b7d1 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -40,7 +40,6 @@ SplineWrapper, betaln, bound, - clipped_beta_rvs, gammaln, i0e, incomplete_beta, @@ -804,7 +803,25 @@ def dist(cls, sigma=None, tau=None, sd=None, *args, **kwargs): return super().dist([0.0, sigma], **kwargs) - def logp(value, loc, sigma): + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # sigma = draw_values([self.sigma], point=point, size=size)[0] + # return generate_samples( + # stats.halfnorm.rvs, loc=0.0, scale=sigma, dist_shape=self.shape, size=size + # ) + + def logp(self, value): """ Calculate log-probability of HalfNormal distribution at specified value. 
@@ -1180,8 +1197,25 @@ def get_alpha_beta(self, alpha=None, beta=None, mu=None, sigma=None): return alpha, beta - def _distr_parameters_for_repr(self): - return ["alpha", "beta"] + def random(self, point=None, size=None): + """ + Draw random values from Beta distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # alpha, beta = draw_values([self.alpha, self.beta], point=point, size=size) + # return generate_samples(clipped_beta_rvs, alpha, beta, dist_shape=self.shape, size=size) def logp(value, alpha, beta): """ @@ -1396,7 +1430,29 @@ def dist(cls, lam, *args, **kwargs): assert_negative_support(lam, "lam", "Exponential") return super().dist([lam], **kwargs) - def logp(value, lam): + def random(self, point=None, size=None): + """ + Draw random values from Exponential distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # lam = draw_values([self.lam], point=point, size=size)[0] + # return generate_samples( + # np.random.exponential, scale=1.0 / lam, dist_shape=self.shape, size=size + # ) + + def logp(self, value): """ Calculate log-probability of Exponential distribution at specified value. @@ -2183,8 +2239,12 @@ def dist(cls, alpha, beta, *args, **kwargs): # median = alpha # mode = alpha - assert_negative_support(beta, "beta", "Cauchy") - return super().dist([alpha, beta], **kwargs) + Returns + ------- + array + """ + # alpha, beta = draw_values([self.alpha, self.beta], point=point, size=size) + # return generate_samples(self._random, alpha, beta, dist_shape=self.shape, size=size) def logp(value, alpha, beta): """ @@ -2271,7 +2331,31 @@ def dist(cls, beta, *args, **kwargs): assert_negative_support(beta, "beta", "HalfCauchy") return super().dist([0.0, beta], **kwargs) - def logp(value, loc, beta): + def _random(self, beta, size=None): + u = np.random.uniform(size=size) + return beta * np.abs(np.tan(np.pi * (u - 0.5))) + + def random(self, point=None, size=None): + """ + Draw random values from HalfCauchy distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # beta = draw_values([self.beta], point=point, size=size)[0] + # return generate_samples(self._random, beta, dist_shape=self.shape, size=size) + + def logp(self, value): """ Calculate log-probability of HalfCauchy distribution at specified value. @@ -2558,9 +2642,27 @@ def _get_alpha_beta(cls, alpha, beta, mu, sigma): return alpha, beta - @classmethod - def _distr_parameters_for_repr(self): - return ["alpha", "beta"] + def random(self, point=None, size=None): + """ + Draw random values from InverseGamma distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). 
+ + Returns + ------- + array + """ + # alpha, beta = draw_values([self.alpha, self.beta], point=point, size=size) + # return generate_samples( + # stats.invgamma.rvs, a=alpha, scale=beta, dist_shape=self.shape, size=size + # ) def logp(value, alpha, beta): """ diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 5c941d58a6..f897f432e8 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -35,7 +35,7 @@ normal_lcdf, ) from pymc3.distributions.distribution import Discrete -from pymc3.math import log1mexp, logaddexp, logsumexp, sigmoid, tround +from pymc3.math import log1mexp, log1pexp, logaddexp, logit, logsumexp, sigmoid, tround __all__ = [ "Binomial", @@ -281,6 +281,7 @@ def random(self, point=None, size=None): # return generate_samples( # self._random, alpha=alpha, beta=beta, n=n, dist_shape=self.shape, size=size # ) + pass def logp(self, value): r""" @@ -384,11 +385,41 @@ class Bernoulli(Discrete): """ rv_op = bernoulli - @classmethod - def dist(cls, p=None, logit_p=None, *args, **kwargs): - p = at.as_tensor_variable(floatX(p)) - # mode = at.cast(tround(p), "int8") - return super().dist([p], **kwargs) + def __init__(self, p=None, logit_p=None, *args, **kwargs): + super().__init__(*args, **kwargs) + if sum(int(var is None) for var in [p, logit_p]) != 1: + raise ValueError("Specify one of p and logit_p") + if p is not None: + self._is_logit = False + self.p = p = aet.as_tensor_variable(floatX(p)) + self._logit_p = logit(p) + else: + self._is_logit = True + self.p = aet.nnet.sigmoid(floatX(logit_p)) + self._logit_p = aet.as_tensor_variable(logit_p) + + self.mode = aet.cast(tround(self.p), "int8") + + def random(self, point=None, size=None): + r""" + Draw random values from Bernoulli distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # p = draw_values([self.p], point=point, size=size)[0] + # return generate_samples(stats.bernoulli.rvs, p, dist_shape=self.shape, size=size) + pass def logp(value, p): r""" @@ -529,6 +560,7 @@ def random(self, point=None, size=None): """ # q, beta = draw_values([self.q, self.beta], point=point, size=size) # return generate_samples(self._random, q, beta, dist_shape=self.shape, size=size) + pass def logp(self, value): r""" @@ -627,11 +659,31 @@ class Poisson(Discrete): """ rv_op = poisson - @classmethod - def dist(cls, mu, *args, **kwargs): - mu = at.as_tensor_variable(floatX(mu)) - # mode = intX(at.floor(mu)) - return super().dist([mu], *args, **kwargs) + def __init__(self, mu, *args, **kwargs): + super().__init__(*args, **kwargs) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.mode = intX(aet.floor(mu)) + + def random(self, point=None, size=None): + r""" + Draw random values from Poisson distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). 
+ + Returns + ------- + array + """ + # mu = draw_values([self.mu], point=point, size=size)[0] + # return generate_samples(stats.poisson.rvs, mu, dist_shape=self.shape, size=size) + pass def logp(value, mu): r""" @@ -768,7 +820,42 @@ def get_mu_alpha(cls, mu=None, alpha=None, p=None, n=None): elif mu is not None: raise ValueError("Incompatible parametrization. Can't specify both mu and p.") - return n, p + return mu, alpha + + def random(self, point=None, size=None): + r""" + Draw random values from NegativeBinomial distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # mu, alpha = draw_values([self.mu, self.alpha], point=point, size=size) + # g = generate_samples(self._random, mu=mu, alpha=alpha, dist_shape=self.shape, size=size) + # g[g == 0] = np.finfo(float).eps # Just in case + # return np.asarray(stats.poisson.rvs(g)).reshape(g.shape) + pass + + def _random(self, mu, alpha, size): + r"""Wrapper around stats.gamma.rvs that converts NegativeBinomial's + parametrization to scipy.gamma. All parameter arrays should have + been broadcasted properly by generate_samples at this point and size is + the scipy.rvs representation. + """ + return stats.gamma.rvs( + a=alpha, + scale=mu / alpha, + size=size, + ) def logp(value, n, p): r""" @@ -889,6 +976,7 @@ def random(self, point=None, size=None): """ # p = draw_values([self.p], point=point, size=size)[0] # return generate_samples(np.random.geometric, p, dist_shape=self.shape, size=size) + pass def logp(self, value): r""" @@ -1006,6 +1094,7 @@ def random(self, point=None, size=None): # N, k, n = draw_values([self.N, self.k, self.n], point=point, size=size) # return generate_samples(self._random, N, k, n, dist_shape=self.shape, size=size) + pass def _random(self, M, n, N, size=None): r"""Wrapper around scipy stat's hypergeom.rvs""" @@ -1159,6 +1248,7 @@ def random(self, point=None, size=None): """ # lower, upper = draw_values([self.lower, self.upper], point=point, size=size) # return generate_samples(self._random, lower, upper, dist_shape=self.shape, size=size) + pass def logp(self, value): r""" @@ -1338,6 +1428,7 @@ def random(self, point=None, size=None): # return np.full(size, fill_value=c, dtype=dtype) # # return generate_samples(_random, c=c, dist_shape=self.shape, size=size).astype(dtype) + pass def logp(self, value): r""" @@ -1439,6 +1530,7 @@ def random(self, point=None, size=None): # g = generate_samples(stats.poisson.rvs, theta, dist_shape=self.shape, size=size) # g, psi = broadcast_distribution_samples([g, psi], size=size) # return g * (np.random.random(g.shape) < psi) + pass def logp(self, value): r""" @@ -1571,6 +1663,7 @@ def random(self, point=None, size=None): # g = generate_samples(stats.binom.rvs, n, p, dist_shape=self.shape, size=size) # g, psi = broadcast_distribution_samples([g, psi], size=size) # return g * (np.random.random(g.shape) < psi) + pass def logp(self, value): r""" @@ -1727,6 +1820,7 @@ def random(self, point=None, size=None): # g[g == 0] = np.finfo(float).eps # Just in case # g, psi = broadcast_distribution_samples([g, psi], size=size) # return stats.poisson.rvs(g) * (np.random.random(g.shape) < psi) + pass def _random(self, mu, alpha, size): r"""Wrapper around stats.gamma.rvs that converts NegativeBinomial's diff --git a/pymc3/distributions/distribution.py 
b/pymc3/distributions/distribution.py index dee5ceb129..bb4a1681e0 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -36,26 +36,11 @@ import aesara.tensor as aet import numpy as np -from aesara import function from aesara.compile.sharedvalue import SharedVariable from aesara.graph.basic import Constant from aesara.tensor.var import TensorVariable -from cachetools import LRUCache, cached - -from pymc3.distributions.shape_utils import ( - broadcast_dist_samples_shape, - get_broadcastable_dist_samples, - to_tuple, -) -from pymc3.model import ( - ContextMeta, - FreeRV, - Model, - MultiObservedRV, - ObservedRV, - build_named_node_tree, -) -from pymc3.util import get_repr_for_variable, get_var_name, hash_key + +from pymc3.util import get_repr_for_variable from pymc3.vartypes import string_types __all__ = [ @@ -64,8 +49,6 @@ "Continuous", "Discrete", "NoDistribution", - "draw_values", - "generate_samples", ] vectorized_ppc = contextvars.ContextVar( @@ -173,9 +156,6 @@ def __new__(cls, name, *args, **kwargs): data = kwargs.pop("observed", None) - if isinstance(data, ObservedRV) or isinstance(data, FreeRV): - raise TypeError("observed needs to be data but got: {}".format(type(data))) - total_size = kwargs.pop("total_size", None) dims = kwargs.pop("dims", None) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 17bc671a29..153fdf156e 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -44,6 +44,7 @@ from pymc3.distributions.continuous import ChiSquared, Normal from pymc3.distributions.dist_math import bound, factln, logpow from pymc3.distributions.distribution import Continuous, Discrete +from pymc3.distributions.shape_utils import to_tuple from pymc3.distributions.special import gammaln, multigammaln from pymc3.math import kron_diag, kron_dot, kron_solve_lower, kronecker @@ -179,11 +180,57 @@ class MvNormal(Continuous): """ rv_op = multivariate_normal - @classmethod - def dist(cls, mu, cov=None, tau=None, chol=None, lower=True, **kwargs): - mu = at.as_tensor_variable(mu) - cov = quaddist_matrix(cov, chol, tau, lower) - return super().dist([mu, cov], **kwargs) + def __init__(self, mu, cov=None, tau=None, chol=None, lower=True, *args, **kwargs): + super().__init__(mu=mu, cov=cov, tau=tau, chol=chol, lower=lower, *args, **kwargs) + self.mean = self.median = self.mode = self.mu = self.mu + + def random(self, point=None, size=None): + """ + Draw random values from Multivariate Normal distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # size = to_tuple(size) + # + # param_attribute = getattr(self, "chol_cov" if self._cov_type == "chol" else self._cov_type) + # mu, param = draw_values([self.mu, param_attribute], point=point, size=size) + # + # dist_shape = to_tuple(self.shape) + # output_shape = size + dist_shape + # + # # Simple, there can be only be 1 batch dimension, only available from `mu`. + # # Insert it into `param` before events, if there is a sample shape in front. 
+ # if param.ndim > 2 and dist_shape[:-1]: + # param = param.reshape(size + (1,) + param.shape[-2:]) + # + # mu = broadcast_dist_samples_to(to_shape=output_shape, samples=[mu], size=size)[0] + # param = np.broadcast_to(param, shape=output_shape + dist_shape[-1:]) + # + # assert mu.shape == output_shape + # assert param.shape == output_shape + dist_shape[-1:] + # + # if self._cov_type == "cov": + # chol = np.linalg.cholesky(param) + # elif self._cov_type == "chol": + # chol = param + # else: # tau -> chol -> swapaxes (chol, -1, -2) -> inv ... + # lower_chol = np.linalg.cholesky(param) + # upper_chol = np.swapaxes(lower_chol, -1, -2) + # chol = np.linalg.inv(upper_chol) + # + # standard_normal = np.random.standard_normal(output_shape) + # return mu + np.einsum("...ij,...j->...i", chol, standard_normal) def logp(value, mu, cov): """ @@ -296,7 +343,7 @@ def random(self, point=None, size=None): # chi2_samples = chi2_samples.reshape(chi2_samples.shape + (1,) * len(self.shape)) # return (samples / np.sqrt(chi2_samples / nu)) + mu - def logp(value, nu, cov): + def logp(self, value): """ Calculate log-probability of Multivariate Student's T distribution at specified value. @@ -458,7 +505,62 @@ def dist(cls, n, p, *args, **kwargs): # mode = at.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) return super().dist([n, p], *args, **kwargs) - def logp(value, n, p): + # Thanks to the default shape handling done in generate_values, the last + # axis of n is a dummy axis that allows it to broadcast well with p + n = np.broadcast_to(n, size) + p = np.broadcast_to(p, size) + n = n[..., 0] + + # np.random.multinomial needs `n` to be a scalar int and `p` a + # sequence so we semi flatten them and iterate over them + size_ = to_tuple(raw_size) + if p.ndim > len(size_) and p.shape[: len(size_)] == size_: + # p and n have the size_ prepend so we don't need it in np.random + n_ = n.reshape([-1]) + p_ = p.reshape([-1, p.shape[-1]]) + samples = np.array([np.random.multinomial(nn, pp) for nn, pp in zip(n_, p_)]) + samples = samples.reshape(p.shape) + else: + # p and n don't have the size prepend + n_ = n.reshape([-1]) + p_ = p.reshape([-1, p.shape[-1]]) + samples = np.array( + [np.random.multinomial(nn, pp, size=size_) for nn, pp in zip(n_, p_)] + ) + samples = np.moveaxis(samples, 0, -1) + samples = samples.reshape(size + p.shape) + # We cast back to the original dtype + return samples.astype(original_dtype) + + def random(self, point=None, size=None): + """ + Draw random values from Multinomial distribution. + + Parameters + ---------- + point: dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size: int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + # n, p = draw_values([self.n, self.p], point=point, size=size) + # samples = generate_samples( + # self._random, + # n, + # p, + # dist_shape=self.shape, + # not_broadcast_kwargs={"raw_size": size}, + # size=size, + # ) + # return samples + + def logp(self, x): """ Calculate log-probability of Multinomial distribution at specified value. 
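# A NumPy-only illustration of the "semi-flatten" strategy used in the
# commented-out sampler above: `np.random.multinomial` accepts only a scalar
# `n` and a 1-d `p`, so batched parameters are flattened, drawn one at a
# time, and reshaped back (the shapes below are hypothetical):
#
#     import numpy as np
#
#     n = np.array([10, 11])                    # batch of totals
#     p = np.array([[0.2, 0.3, 0.5],
#                   [0.9, 0.09, 0.01]])         # batch of probability vectors
#     n_ = n.reshape(-1)
#     p_ = p.reshape(-1, p.shape[-1])
#     draws = np.array([np.random.multinomial(nn, pp) for nn, pp in zip(n_, p_)])
#     draws = draws.reshape(p.shape)            # back to the batched shape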
@@ -876,9 +978,9 @@ def WishartBartlett(name, S, nu, is_cholesky=False, return_cholesky=False, testv # L * A * A.T * L.T ~ Wishart(L*L.T, nu) if return_cholesky: - return pm.Deterministic(name, at.dot(L, A)) + return pm.Deterministic(name, aet.dot(L, A)) else: - return pm.Deterministic(name, at.dot(at.dot(at.dot(L, A), A.T), L.T)) + return pm.Deterministic(name, aet.dot(aet.dot(aet.dot(L, A), A.T), L.T)) def _lkj_normalizing_constant(eta, n): diff --git a/pymc3/distributions/timeseries.py b/pymc3/distributions/timeseries.py index 0c72550387..3213fbe1fb 100644 --- a/pymc3/distributions/timeseries.py +++ b/pymc3/distributions/timeseries.py @@ -140,7 +140,7 @@ def __init__( self.p = p self.constant = constant - self.rho = rho = at.as_tensor_variable(rho) + self.rho = rho = aet.as_tensor_variable(rho) self.init = init or Flat.dist() def logp(self, value): @@ -212,9 +212,9 @@ def __init__(self, tau=None, init=None, sigma=None, mu=0.0, sd=None, *args, **kw self.tau = at.as_tensor_variable(tau) sigma = at.as_tensor_variable(sigma) self.sigma = self.sd = sigma - self.mu = at.as_tensor_variable(mu) + self.mu = aet.as_tensor_variable(mu) self.init = init or Flat.dist() - self.mean = at.as_tensor_variable(0.0) + self.mean = aet.as_tensor_variable(0.0) def _mu_and_sigma(self, mu, sigma): """Helper to get mu and sigma if they are high dimensional.""" diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index 34a845665e..b6741af6a3 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -105,11 +105,83 @@ def __str__(self): class ElemwiseTransform(Transform): - def jacobian_det(self, rv_var, rv_value): - grad = at.reshape( - gradient(at.sum(self.backward(rv_var, rv_value)), [rv_value]), rv_value.shape - ) - return at.log(at.abs_(grad)) + def jacobian_det(self, x): + grad = aet.reshape(gradient(aet.sum(self.backward(x)), [x]), x.shape) + return aet.log(aet.abs_(grad)) + + +class TransformedDistribution(distribution.Distribution): + """A distribution that has been transformed from one space into another.""" + + def __init__(self, dist, transform, *args, **kwargs): + """ + Parameters + ---------- + dist: Distribution + transform: Transform + args, kwargs + arguments to Distribution""" + forward = transform.forward + testval = forward(dist.default()) + + self.dist = dist + self.transform_used = transform + # XXX: `FreeRV` no longer exists + v = None # forward(FreeRV(name="v", distribution=dist)) + self.type = v.type + + super().__init__(v.shape.tag.test_value, v.dtype, testval, dist.defaults, *args, **kwargs) + + if transform.name == "stickbreaking": + b = np.hstack(((np.atleast_1d(self.shape) == 1)[:-1], False)) + # force the last dim not broadcastable + self.type = TensorType(v.dtype, b) + + def logp(self, x): + """ + Calculate log-probability of Transformed distribution at specified value. + + Parameters + ---------- + x: numeric + Value for which log-probability is calculated. + + Returns + ------- + TensorVariable + """ + logp_nojac = self.logp_nojac(x) + jacobian_det = self.transform_used.jacobian_det(x) + if logp_nojac.ndim > jacobian_det.ndim: + logp_nojac = logp_nojac.sum(axis=-1) + return logp_nojac + jacobian_det + + def logp_nojac(self, x): + """ + Calculate log-probability of Transformed distribution at specified value + without jacobian term for transforms. + + Parameters + ---------- + x: numeric + Value for which log-probability is calculated. 
+ + Returns + ------- + TensorVariable + """ + return self.dist.logp(self.transform_used.backward(x)) + + def _repr_latex_(self, **kwargs): + # prevent TransformedDistributions from ending up in LaTeX representations + # of models + return None + + def _distr_parameters_for_repr(self): + return [] + + +transform = Transform class Log(ElemwiseTransform): diff --git a/pymc3/model.py b/pymc3/model.py index 1171ee29df..d6181eae86 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -29,9 +29,8 @@ from aesara.compile.sharedvalue import SharedVariable from aesara.gradient import grad -from aesara.graph.basic import Apply, Variable +from aesara.graph.basic import Variable from aesara.tensor.random.op import Observed, observed -from aesara.tensor.type import TensorType from aesara.tensor.var import TensorVariable from pandas import Series @@ -40,10 +39,10 @@ from pymc3.aesaraf import generator, gradient, hessian, inputvars from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.data import GenTensorVariable, Minibatch -from pymc3.distributions import _get_scaling, change_rv_size, logpt, logpt_sum +from pymc3.distributions import change_rv_size, logpt, logpt_sum from pymc3.exceptions import ImputationWarning from pymc3.math import flatten_list -from pymc3.util import WithMemoization, get_transformed_name, get_var_name +from pymc3.util import WithMemoization, get_var_name from pymc3.vartypes import continuous_types, discrete_types, isgenerator, typefilter __all__ = [ @@ -1102,8 +1101,7 @@ def flatten(self, vars=None, order=None, inputvar=None): ) last_idx += arr_len - view = {vm.var: vm for vm in order.vmap} - flat_view = FlatView(inputvar, replacements, view) + flat_view = FlatView(inputvar, replacements) return flat_view @@ -1393,71 +1391,45 @@ def __call__(self, *args, **kwargs): compilef = fastfn -class FreeRV(Factor, PyMC3Variable): - """Unobserved random variable that a model is specified in terms of.""" +def pandas_to_array(data): + """Convert a pandas object to a NumPy array. - dshape = None # type: Tuple[int, ...] - size = None # type: int - distribution = None # type: Optional[Distribution] - model = None # type: Optional[Model] + XXX: When `data` is a generator, this will return a Aesara tensor! - def __init__( - self, - type=None, - owner=None, - index=None, - name=None, - distribution=None, - total_size=None, - model=None, - ): - """ - Parameters - ---------- - type: aesara type (optional) - owner: aesara owner (optional) - name: str - distribution: Distribution - model: Model - total_size: scalar Tensor (optional) - needed for upscaling logp - """ - if type is None: - type = distribution.type - super().__init__(type, owner, index, name) - - if distribution is not None: - self.dshape = tuple(distribution.shape) - self.dsize = int(np.prod(distribution.shape)) - self.distribution = distribution - self.tag.test_value = ( - np.ones(distribution.shape, distribution.dtype) * distribution.default() - ) - self.logp_elemwiset = distribution.logp(self) - # The logp might need scaling in minibatches. - # This is done in `Factor`. 
- self.logp_sum_unscaledt = distribution.logp_sum(self) - self.logp_nojac_unscaledt = distribution.logp_nojac(self) - self.total_size = total_size - self.model = model - self.scaling = _get_scaling(total_size, self.shape, self.ndim) - - incorporate_methods( - source=distribution, - destination=self, - methods=["random"], - wrapper=InstanceMethod, - ) - - @property - def init_value(self): - """Convenience attribute to return tag.test_value""" - return self.tag.test_value - - rv_var = change_rv_size(rv_var, new_size) - - if aesara.config.compute_test_value != "off": - test_value = getattr(rv_var.tag, "test_value", None) + """ + if hasattr(data, "to_numpy") and hasattr(data, "isnull"): + # typically, but not limited to pandas objects + vals = data.to_numpy() + mask = data.isnull().to_numpy() + if mask.any(): + # there are missing values + ret = np.ma.MaskedArray(vals, mask) + else: + ret = vals + elif isinstance(data, np.ndarray): + if isinstance(data, np.ma.MaskedArray): + if not data.mask.any(): + # empty mask + ret = data.filled() + else: + # already masked and rightly so + ret = data + else: + # already a ndarray, but not masked + mask = np.isnan(data) + if np.any(mask): + ret = np.ma.MaskedArray(data, mask) + else: + # no masking required + ret = data + elif isinstance(data, Variable): + ret = data + elif sps.issparse(data): + ret = data + elif isgenerator(data): + ret = generator(data) + else: + ret = np.asarray(data) if test_value is not None: # We try to reuse the old test value @@ -1546,120 +1518,6 @@ def make_obs_var( rv_var.tag.observations = data -class ObservedRV(Factor, PyMC3Variable): - """Observed random variable that a model is specified in terms of. - Potentially partially observed. - """ - - def __init__( - self, - type=None, - owner=None, - index=None, - name=None, - data=None, - distribution=None, - total_size=None, - model=None, - ): - """ - Parameters - ---------- - type: aesara type (optional) - owner: aesara owner (optional) - name: str - distribution: Distribution - model: Model - total_size: scalar Tensor (optional) - needed for upscaling logp - """ - - if hasattr(data, "type") and isinstance(data.type, TensorType): - type = data.type - - if type is None: - data = pandas_to_array(data) - if isinstance(data, Variable): - type = data.type - else: - type = TensorType(distribution.dtype, [s == 1 for s in data.shape]) - - self.observations = data - - super().__init__(type, owner, index, name) - - if distribution is not None: - data = aet.as_tensor(data, name, model, distribution) - - self.missing_values = data.missing_values - self.logp_elemwiset = distribution.logp(data) - # The logp might need scaling in minibatches. - # This is done in `Factor`. - self.logp_sum_unscaledt = distribution.logp_sum(data) - self.logp_nojac_unscaledt = distribution.logp_nojac(data) - self.total_size = total_size - self.model = model - self.distribution = distribution - - # make this RV a view on the combined missing/nonmissing array - Apply(aesara.compile.view_op, inputs=[data], outputs=[self]) - self.tag.test_value = aesara.compile.view_op(data).tag.test_value.astype(self.dtype) - self.scaling = _get_scaling(total_size, data.shape, data.ndim) - - @property - def init_value(self): - """Convenience attribute to return tag.test_value""" - return self.tag.test_value - - -class MultiObservedRV(Factor): - """Observed random variable that a model is specified in terms of. - Potentially partially observed. 
- """ - - def __init__(self, name, data, distribution, total_size=None, model=None): - """ - Parameters - ---------- - type: aesara type (optional) - owner: aesara owner (optional) - name: str - distribution: Distribution - model: Model - total_size: scalar Tensor (optional) - needed for upscaling logp - """ - self.name = name - self.data = { - name: aet.as_tensor(data, name, model, distribution) for name, data in data.items() - } - - self.missing_values = [ - datum.missing_values for datum in self.data.values() if datum.missing_values is not None - ] - self.logp_elemwiset = distribution.logp(**self.data) - # The logp might need scaling in minibatches. - # This is done in `Factor`. - self.logp_sum_unscaledt = distribution.logp_sum(**self.data) - self.logp_nojac_unscaledt = distribution.logp_nojac(**self.data) - self.total_size = total_size - self.model = model - self.distribution = distribution - self.scaling = _get_scaling(total_size, self.logp_elemwiset.shape, self.logp_elemwiset.ndim) - - # Make hashable by id for draw_values - def __hash__(self): - return id(self) - - def __eq__(self, other): - "Use object identity for MultiObservedRV equality." - # This is likely a Bad Thing, but changing it would break a lot of code. - return self is other - - def __ne__(self, other): - return not self == other - - def _walk_up_rv(rv, formatting="plain"): """Walk up aesara graph to get inputs for deterministic RV.""" all_rvs = [] @@ -1738,67 +1596,6 @@ def Potential(name, var, model=None): return var -class TransformedRV(PyMC3Variable): - """ - Parameters - ---------- - - type: aesara type (optional) - owner: aesara owner (optional) - name: str - distribution: Distribution - model: Model - total_size: scalar Tensor (optional) - needed for upscaling logp - """ - - def __init__( - self, - type=None, - owner=None, - index=None, - name=None, - distribution=None, - model=None, - transform=None, - total_size=None, - ): - if type is None: - type = distribution.type - super().__init__(type, owner, index, name) - - self.transformation = transform - - if distribution is not None: - self.model = model - self.distribution = distribution - self.dshape = tuple(distribution.shape) - self.dsize = int(np.prod(distribution.shape)) - - transformed_name = get_transformed_name(name, transform) - - self.transformed = model.Var( - transformed_name, transform.apply(distribution), total_size=total_size - ) - - normalRV = transform.backward(self.transformed) - - Apply(aesara.compile.view_op, inputs=[normalRV], outputs=[self]) - self.tag.test_value = normalRV.tag.test_value - self.scaling = _get_scaling(total_size, self.shape, self.ndim) - incorporate_methods( - source=distribution, - destination=self, - methods=["random"], - wrapper=InstanceMethod, - ) - - @property - def init_value(self): - """Convenience attribute to return tag.test_value""" - return self.tag.test_value - - def as_iterargs(data): if isinstance(data, tuple): return data @@ -1807,7 +1604,7 @@ def as_iterargs(data): def all_continuous(vars): - """Check that vars not include discrete variables or BART variables, excepting ObservedRVs.""" + """Check that vars not include discrete variables or BART variables, excepting observed RVs.""" vars_ = [var for var in vars if not (var.owner and isinstance(var.owner.op, Observed))] if any( diff --git a/pymc3/model_graph.py b/pymc3/model_graph.py index e35eaf1123..47f6625b17 100644 --- a/pymc3/model_graph.py +++ b/pymc3/model_graph.py @@ -17,7 +17,7 @@ from aesara.compile.sharedvalue import SharedVariable from 
aesara.graph.basic import walk -from aesara.tensor.random.op import RandomVariable +from aesara.tensor.random.op import Observed from aesara.tensor.var import TensorVariable import pymc3 as pm @@ -112,7 +112,7 @@ def update_input_map(key: str, val: Set[VarName]): for var_name in self.var_names: var = self.model[var_name] update_input_map(var_name, self.get_parents(var)) - if hasattr(var.tag, "observations"): + if var.owner and isinstance(var.owner.op, Observed): try: obs_name = var.tag.observations.name if obs_name: @@ -128,7 +128,7 @@ def _make_node(self, var_name, graph, *, formatting: str = "plain"): # styling for node attrs = {} - if v.owner and isinstance(v.owner.op, RandomVariable) and hasattr(v.tag, "observations"): + if v.owner and isinstance(v.owner.op, Observed): attrs["style"] = "filled" # make Data be roundtangle, instead of rectangle diff --git a/pymc3/sampling.py b/pymc3/sampling.py index ed4cb7838f..ff1b03f694 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -41,8 +41,7 @@ from pymc3.backends.base import BaseTrace, MultiTrace from pymc3.backends.ndarray import NDArray from pymc3.blocking import DictToArrayBijection -from pymc3.distributions.distribution import draw_values -from pymc3.distributions.posterior_predictive import fast_sample_posterior_predictive +from pymc3.distributions import change_rv_size, rv_ancestors, strip_observed from pymc3.exceptions import IncorrectArgumentsError, SamplingError from pymc3.model import Model, Point, modelcontext from pymc3.parallel_sampling import Draw, _cpu_count diff --git a/pymc3/step_methods/arraystep.py b/pymc3/step_methods/arraystep.py index 3224380a9f..6d765ca529 100644 --- a/pymc3/step_methods/arraystep.py +++ b/pymc3/step_methods/arraystep.py @@ -21,7 +21,7 @@ from numpy.random import uniform from pymc3.blocking import DictToArrayBijection, RaveledVars -from pymc3.model import PyMC3Variable, modelcontext +from pymc3.model import modelcontext from pymc3.step_methods.compound import CompoundStep from pymc3.util import get_var_name diff --git a/pymc3/step_methods/gibbs.py b/pymc3/step_methods/gibbs.py index cde14c9916..49737676cb 100644 --- a/pymc3/step_methods/gibbs.py +++ b/pymc3/step_methods/gibbs.py @@ -19,9 +19,6 @@ """ from warnings import warn -import aesara.tensor as at - -from aesara.graph.basic import graph_inputs from numpy import arange, array, cumsum, empty, exp, max, nested_iters, searchsorted from numpy.random import uniform diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 541821b742..40f542a70f 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -120,8 +120,8 @@ def raise_ok(self, map_info=None): Parameters ---------- - vmap: list of blocking.VarMap - List of `VarMap`s, which are namedtuples with var, slc, shp, dtyp + map_info: List of (name, shape, dtype) + List tuples with variable name, shape, and dtype. Raises ------ @@ -245,8 +245,8 @@ def raise_ok(self, map_info): Parameters ---------- - vmap: List of tuples (var, ) - List of `VarMap`s, which are namedtuples with var, slc, shp, dtyp + map_info: List of (name, shape, dtype) + List tuples with variable name, shape, and dtype. 
Raises ------ diff --git a/pymc3/step_methods/metropolis.py b/pymc3/step_methods/metropolis.py index 901948600c..13e7f0d84f 100644 --- a/pymc3/step_methods/metropolis.py +++ b/pymc3/step_methods/metropolis.py @@ -25,7 +25,6 @@ from pymc3.aesaraf import floatX from pymc3.blocking import DictToArrayBijection -from pymc3.distributions import draw_values from pymc3.step_methods.arraystep import ( ArrayStep, ArrayStepShared, @@ -158,7 +157,8 @@ def __init__( vars = pm.inputvars(vars) if S is None: - S = np.ones(sum(initial_values[v.name].size for v in vars)) + # XXX: This needs to be refactored + S = None # np.ones(sum(v.dsize for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) @@ -177,7 +177,8 @@ def __init__( # Determine type of variables self.discrete = np.concatenate( - [[v.dtype in pm.discrete_types] * (initial_values[v.name].size or 1) for v in vars] + # XXX: This needs to be refactored + None # [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars] ) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() @@ -409,8 +410,8 @@ def __init__(self, vars, order="random", transit_p=0.8, model=None): # transition probabilities self.transit_p = transit_p - initial_point = model.initial_point - self.dim = sum(initial_point[v.name].size for v in vars) + # XXX: This needs to be refactored + self.dim = None # sum(v.dsize for v in vars) if order == "random": self.shuffle_dims = True @@ -510,17 +511,17 @@ def __init__(self, vars, proposal="uniform", order="random", model=None): distr = getattr(rv_var.owner, "op", None) if isinstance(distr, CategoricalRV): - k_graph = rv_var.owner.inputs[3].shape[-1] - (k_graph,), _ = rvs_to_value_vars((k_graph,), apply_transforms=True) - k = model.fn(k_graph)(initial_point) - elif isinstance(distr, BernoulliRV): + # XXX: This needs to be refactored + k = None # draw_values([distr.k])[0] + elif isinstance(distr, pm.Bernoulli) or (v.dtype in pm.bool_types): k = 2 else: raise ValueError( "All variables must be categorical or binary" + "for CategoricalGibbsMetropolis" ) start = len(dimcats) - dimcats += [(dim, k) for dim in range(start, start + v_init_val.size)] + # XXX: This needs to be refactored + dimcats += None # [(dim, k) for dim in range(start, start + v.dsize)] if order == "random": self.shuffle_dims = True diff --git a/pymc3/step_methods/sgmcmc.py b/pymc3/step_methods/sgmcmc.py index 9fabf9cf62..a3e4262b4d 100644 --- a/pymc3/step_methods/sgmcmc.py +++ b/pymc3/step_methods/sgmcmc.py @@ -162,8 +162,8 @@ def __init__( # This seems to be the only place that `Model.flatten` is used. # TODO: Why not _actually_ flatten the variables? - # E.g. `flat_vars = at.concatenate([var.ravel() for var in vars])` - # or `set_subtensor` the `vars` into a `at.vector`? + # E.g. `flat_vars = aet.concatenate([var.ravel() for var in vars])` + # or `set_subtensor` the `vars` into a `aet.vector`? 
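# A minimal sketch of the flattening alternative raised in the TODO above,
# assuming `vars` is a list of Aesara tensors (with a symbolic round trip
# back via `split`/`reshape`):
#
#     import aesara.tensor as aet
#
#     flat_vars = aet.concatenate([var.ravel() for var in vars])
#     sizes = [var.size for var in vars]
#     parts = aet.split(flat_vars, sizes, len(vars))
#     unflattened = [part.reshape(var.shape) for part, var in zip(parts, vars)]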
flat_view = model.flatten(vars) self.inarray = [flat_view.input] diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 16b960f1a2..ff3702c252 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -30,7 +30,7 @@ from pymc3.aesaraf import change_rv_size, floatX, intX from pymc3.distributions.dist_math import clipped_beta_rvs -from pymc3.distributions.shape_utils import to_tuple +from pymc3.distributions.distribution import to_tuple from pymc3.exceptions import ShapeError from pymc3.tests.helpers import SeededTest from pymc3.tests.test_distributions import ( @@ -1261,7 +1261,7 @@ def test_mixture_random_shape(): # XXX: This needs to be refactored rand0, rand1, rand2, rand3 = [None] * 4 # draw_values( - # [like0, like1, like2, like3], point=m.initial_point, size=100 + # [like0, like1, like2, like3], point=m.test_point, size=100 # ) assert rand0.shape == (100, 20) assert rand1.shape == (100, 20) @@ -1299,7 +1299,7 @@ def test_mixture_random_shape_fast(): # XXX: This needs to be refactored rand0, rand1, rand2, rand3 = [None] * 4 # draw_values( - # [like0, like1, like2, like3], point=m.initial_point, size=100 + # [like0, like1, like2, like3], point=m.test_point, size=100 # ) assert rand0.shape == (100, 20) assert rand1.shape == (100, 20) diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 86d89424f4..5cc7bba127 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -450,8 +450,8 @@ def test_normal_scalar(self): with model: # test list input - ppc0 = pm.sample_posterior_predictive([model.initial_point], samples=10) - # # deprecated argument is not introduced to fast version [2019/08/20:rpg] + ppc0 = pm.sample_posterior_predictive([model.test_point], samples=10) + # deprecated argument is not introduced to fast version [2019/08/20:rpg] ppc = pm.sample_posterior_predictive(trace, var_names=["a"]) # test empty ppc ppc = pm.sample_posterior_predictive(trace, var_names=[]) @@ -461,6 +461,11 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, keep_size=True) assert ppc["a"].shape == (nchains, ndraws) + # test keep_size parameter and idata input + idata = az.from_pymc3(trace) + ppc = pm.sample_posterior_predictive(idata, keep_size=True) + assert ppc["a"].shape == (nchains, ndraws) + # test default case ppc = pm.sample_posterior_predictive(trace, var_names=["a"]) assert "a" in ppc @@ -598,7 +603,6 @@ def test_sum_normal(self): _, pval = stats.kstest(ppc["b"], stats.norm(scale=scale).cdf) assert pval > 0.001 - @pytest.mark.xfail(reason="HalfFlat not refactored for v4") def test_model_not_drawable_prior(self): data = np.random.poisson(lam=10, size=200) model = pm.Model() @@ -666,17 +670,6 @@ def test_deterministic_of_observed(self): rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-4 - model.default_rng.get_value(borrow=True).seed(0) - ppc = pm.sample_posterior_predictive( - model=model, - trace=trace, - samples=len(trace) * nchains, - random_seed=0, - var_names=[var.name for var in (model.deterministics + model.basic_RVs)], - ) - - npt.assert_allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol) - def test_deterministic_of_observed_modified_interface(self): np.random.seed(4982) @@ -961,8 +954,8 @@ def test_multivariate2(self): sim_priors = pm.sample_prior_predictive(samples=20, model=dm_model) sim_ppc = pm.sample_posterior_predictive(burned_trace, samples=20, model=dm_model) assert sim_priors["probs"].shape == (20, 6) - 
assert sim_priors["obs"].shape == (20,) + mn_data.shape - assert sim_ppc["obs"].shape == (20,) + mn_data.shape + assert sim_priors["obs"].shape == (20,) + obs.distribution.shape + assert sim_ppc["obs"].shape == (20,) + obs.distribution.shape def test_layers(self): with pm.Model() as model: diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index b083e57870..3a5644a7bf 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -209,8 +209,8 @@ def parametric_grouped_approxes(request): @pytest.fixture def three_var_aevb_groups(parametric_grouped_approxes, three_var_model, aevb_initial): - one_initial_value = three_var_model.initial_point[three_var_model.one.tag.value_var.name] - dsize = np.prod(one_initial_value.shape[1:]) + # XXX: This needs to be refactored + dsize = None # np.prod(pymc3.util.get_transformed(three_var_model.one).dshape[1:]) cls, kw = parametric_grouped_approxes spec = cls.get_param_spec_for(d=dsize, **kw) params = dict() diff --git a/pymc3/util.py b/pymc3/util.py index 4db6041195..4bf929e36d 100644 --- a/pymc3/util.py +++ b/pymc3/util.py @@ -257,11 +257,7 @@ def get_repr_for_variable(variable, formatting="plain"): def get_var_name(var): - """Get an appropriate, plain variable name for a variable. Necessary - because we override aesara.tensor.var.TensorVariable.__str__ to give informative - string representations to our pymc3.PyMC3Variables, yet we want to use the - plain name as e.g. keys in dicts. - """ + """Get an appropriate, plain variable name for a variable.""" if isinstance(var, TensorVariable): return super(TensorVariable, var).__str__() else: diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index 024b121a91..f2fe93530c 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -59,7 +59,6 @@ from pymc3.aesaraf import at_rng, identity from pymc3.backends import NDArray -from pymc3.blocking import ArrayOrdering, VarMap from pymc3.model import modelcontext from pymc3.util import ( WithMemoization, @@ -960,7 +959,7 @@ def __init_group__(self, group): self.group = [get_transformed(var) for var in self.group] # XXX: This needs to be refactored - self.ordering = ArrayOrdering([]) + # self.ordering = ArrayOrdering([]) self.replacements = dict() for var in self.group: if var.type.numpy_dtype.name in discrete_types: From b5065dce98a63484260f05b6bd800f11eaf09dfa Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Sat, 13 Feb 2021 17:26:44 -0600 Subject: [PATCH 04/44] Refactor tests for compatibility with logp dispatch and RandomVariables --- pymc3/model.py | 4 +- pymc3/tests/models.py | 29 +-- pymc3/tests/test_coords.py | 21 ++ pymc3/tests/test_data_container.py | 4 +- pymc3/tests/test_dist_math.py | 2 + pymc3/tests/test_distribution_defaults.py | 92 +++++++++ pymc3/tests/test_distributions.py | 197 ++++++++++++------- pymc3/tests/test_distributions_random.py | 4 + pymc3/tests/test_distributions_timeseries.py | 3 +- pymc3/tests/test_examples.py | 2 +- pymc3/tests/test_gp.py | 2 + pymc3/tests/test_hmc.py | 2 + pymc3/tests/test_minibatches.py | 4 +- pymc3/tests/test_missing.py | 26 ++- pymc3/tests/test_model.py | 1 - pymc3/tests/test_model_helpers.py | 16 +- 16 files changed, 308 insertions(+), 101 deletions(-) create mode 100644 pymc3/tests/test_coords.py create mode 100644 pymc3/tests/test_distribution_defaults.py diff --git a/pymc3/model.py b/pymc3/model.py index d6181eae86..e649b2eb08 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1343,7 +1343,7 @@ def fastfn(outs, mode=None, model=None): return model.fastfn(outs, mode) -def Point(*args, filter_model_vars=False, **kwargs): +def Point(*args, filter_model_vars=True, **kwargs): """Build a point. Uses same args as dict() does. Filters out variables not in the model. All keys are strings. @@ -1361,7 +1361,7 @@ def Point(*args, filter_model_vars=False, **kwargs): return { get_var_name(k): np.array(v) for k, v in d.items() - if not filter_model_vars or (get_var_name(k) in map(get_var_name, model.value_vars)) + if not filter_model_vars or (get_var_name(k) in map(get_var_name, model.vars)) } diff --git a/pymc3/tests/models.py b/pymc3/tests/models.py index 0289386e54..3d4b471e7a 100644 --- a/pymc3/tests/models.py +++ b/pymc3/tests/models.py @@ -30,7 +30,7 @@ def simple_model(): mu = -2.1 tau = 1.3 with Model() as model: - Normal("x", mu, tau=tau, size=2, testval=floatX_array([0.1, 0.1])) + Normal("x", mu, tau=tau, size=2, testval=np.ones(2) * 0.1) return model.initial_point, model, (mu, tau ** -0.5) @@ -92,9 +92,9 @@ def simple_2model_continuous(): tau = 1.3 with Model() as model: x = pm.Normal("x", mu, tau=tau, testval=0.1) - pm.Deterministic("logx", at.log(x)) + pm.Deterministic("logx", aet.log(x)) pm.Beta("y", alpha=1, beta=1, size=2) - return model.initial_point, model + return model.test_point, model def mv_simple(): @@ -104,8 +104,9 @@ def mv_simple(): with pm.Model() as model: pm.MvNormal( "x", - at.constant(mu), - tau=at.constant(tau), + aet.constant(mu), + tau=aet.constant(tau), + size=3, testval=floatX_array([0.1, 1.0, 0.8]), ) H = tau @@ -120,8 +121,9 @@ def mv_simple_coarse(): with pm.Model() as model: pm.MvNormal( "x", - at.constant(mu), - tau=at.constant(tau), + aet.constant(mu), + tau=aet.constant(tau), + size=3, testval=floatX_array([0.1, 1.0, 0.8]), ) H = tau @@ -136,8 +138,9 @@ def mv_simple_very_coarse(): with pm.Model() as model: pm.MvNormal( "x", - at.constant(mu), - tau=at.constant(tau), + aet.constant(mu), + tau=aet.constant(tau), + size=3, testval=floatX_array([0.1, 1.0, 0.8]), ) H = tau @@ -150,7 +153,7 @@ def mv_simple_discrete(): n = 5 p = floatX_array([0.15, 0.85]) with pm.Model() as model: - pm.Multinomial("x", n, at.constant(p), testval=np.array([1, 4])) + pm.Multinomial("x", n, aet.constant(p), size=d, testval=np.array([1, 4])) mu = n * p # covariance matrix C = np.zeros((d, d)) @@ -184,7 +187,7 @@ def mv_prior_simple(): with pm.Model() as model: x = pm.Flat("x", size=n) - x_obs = pm.MvNormal("x_obs", 
observed=obs, mu=x, cov=noise * np.eye(n)) + x_obs = pm.MvNormal("x_obs", observed=obs, mu=x, cov=noise * np.eye(n), size=n) return model.initial_point, model, (K, L, mu_post, std_post, noise) @@ -192,14 +195,14 @@ def mv_prior_simple(): def non_normal(n=2): with pm.Model() as model: pm.Beta("x", 3, 3, size=n, transform=None) - return model.initial_point, model, (np.tile([0.5], n), None) + return model.test_point, model, (np.tile([0.5], n), None) def exponential_beta(n=2): with pm.Model() as model: pm.Beta("x", 3, 1, size=n, transform=None) pm.Exponential("y", 1, size=n, transform=None) - return model.initial_point, model, None + return model.test_point, model, None def beta_bernoulli(n=2): diff --git a/pymc3/tests/test_coords.py b/pymc3/tests/test_coords.py new file mode 100644 index 0000000000..f8ba32dafa --- /dev/null +++ b/pymc3/tests/test_coords.py @@ -0,0 +1,21 @@ +import numpy as np +import pytest + +import pymc3 as pm + + +@pytest.mark.xfail("Arviz incompatibilities") +def test_coords(): + chains = 2 + n_features = 3 + n_samples = 10 + + coords = {"features": np.arange(n_features)} + + with pm.Model(coords=coords): + a = pm.Uniform("a", -100, 100, dims="features") + b = pm.Uniform("b", -100, 100, dims="features") + tr = pm.sample(n_samples, chains=chains, return_inferencedata=True) + + assert "features" in tr.posterior.a.coords.dims + assert "features" in tr.posterior.b.coords.dims diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index 88a1432d48..d6252470c6 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -182,7 +182,7 @@ def test_shared_scalar_as_rv_input(self): v = pm.Normal("v", mu=shared_var, size=1) np.testing.assert_allclose( - logpt(v, np.r_[5.0]).eval(), + logpt(v, 5.0).eval(), -0.91893853, rtol=1e-5, ) @@ -190,7 +190,7 @@ def test_shared_scalar_as_rv_input(self): shared_var.set_value(10.0) np.testing.assert_allclose( - logpt(v, np.r_[10.0]).eval(), + logpt(v, 10.0).eval(), -0.91893853, rtol=1e-5, ) diff --git a/pymc3/tests/test_dist_math.py b/pymc3/tests/test_dist_math.py index fc1e531a00..d4285f06c6 100644 --- a/pymc3/tests/test_dist_math.py +++ b/pymc3/tests/test_dist_math.py @@ -125,6 +125,7 @@ def logp(value, n, p): ) +@pytest.mark.xfail(reason="This test relies on the deprecated Distribution interface") def test_multinomial_bound(): x = np.array([1, 5]) @@ -143,6 +144,7 @@ def test_multinomial_bound(): ) +@pytest.mark.xfail(reason="MvNormal not implemented") class TestMvNormalLogp: def test_logp(self): np.random.seed(42) diff --git a/pymc3/tests/test_distribution_defaults.py b/pymc3/tests/test_distribution_defaults.py new file mode 100644 index 0000000000..4d0ecfe8b2 --- /dev/null +++ b/pymc3/tests/test_distribution_defaults.py @@ -0,0 +1,92 @@ +# Copyright 2020 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pytest + +from pymc3.distributions import Categorical, Continuous, DiscreteUniform +from pymc3.model import Model + +pytestmark = pytest.mark.xfail(reason="This test relies on the deprecated Distribution interface") + + +class DistTest(Continuous): + def __init__(self, a, b, *args, **kwargs): + super().__init__(*args, **kwargs) + self.a = a + self.b = b + + def logp(self, v): + return 0 + + +def test_default_nan_fail(): + with Model(), pytest.raises(AttributeError): + DistTest("x", np.nan, 2, defaults=["a"]) + + +def test_default_empty_fail(): + with Model(), pytest.raises(AttributeError): + DistTest("x", 1, 2, defaults=[]) + + +def test_default_testval(): + with Model(): + x = DistTest("x", 1, 2, testval=5, defaults=[]) + assert x.tag.test_value == 5 + + +def test_default_testval_nan(): + with Model(): + x = DistTest("x", 1, 2, testval=np.nan, defaults=["a"]) + np.testing.assert_almost_equal(x.tag.test_value, np.nan) + + +def test_default_a(): + with Model(): + x = DistTest("x", 1, 2, defaults=["a"]) + assert x.tag.test_value == 1 + + +def test_default_b(): + with Model(): + x = DistTest("x", np.nan, 2, defaults=["a", "b"]) + assert x.tag.test_value == 2 + + +def test_default_c(): + with Model(): + y = DistTest("y", 7, 8, testval=94) + x = DistTest("x", y, 2, defaults=["a", "b"]) + assert x.tag.test_value == 94 + + +def test_default_discrete_uniform(): + with Model(): + x = DiscreteUniform("x", lower=1, upper=2) + assert x.init_value == 1 + + +def test_discrete_uniform_negative(): + model = Model() + with model: + x = DiscreteUniform("x", lower=-10, upper=0) + assert model.test_point["x"] == -5 + + +def test_categorical_mode(): + model = Model() + with model: + x = Categorical("x", p=np.eye(4), shape=4) + assert np.allclose(model.test_point["x"], np.arange(4)) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index aa838d18df..d685eaec5d 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -226,16 +226,10 @@ def build_model(distfam, valuedomain, vardomains, extra_args=None): with Model() as m: param_vars = {} for v, dom in vardomains.items(): - v_at = aesara.shared(np.asarray(dom.vals[0])) - v_at.name = v - param_vars[v] = v_at - param_vars.update(extra_args) - distfam( - "value", - **param_vars, - transform=None, - ) - return m, param_vars + vals[v] = dom.vals[0] + vals.update(extra_args) + distfam("value", size=valuedomain.shape, transform=None, **vals) + return m def laplace_asymmetric_logpdf(value, kappa, b, mu): @@ -648,8 +642,7 @@ def logp_reference(args): domains["value"] = domain for pt in product(domains, n_samples=n_samples): pt = dict(pt) - pt_d = self._model_input_dict(model, param_vars, pt) - pt_logp = Point(pt_d, model=model) + pt_logp = Point(pt, model=model) pt_ref = Point(pt, filter_model_vars=False, model=model) assert_almost_equal( logp(pt_logp), @@ -752,7 +745,7 @@ def check_logcdf( with Model() as m: dist = pymc3_dist("y", **params) params["value"] = value # for displaying in err_msg - with aesara.config.change_flags(on_opt_error="raise", mode=Mode("py")): + with aesara.config.change_flags(mode=Mode("py")): assert_almost_equal( logcdf(dist, value).eval(), scipy_cdf, @@ -783,12 +776,7 @@ def check_logcdf( if invalid_edge is not None: test_params = valid_params.copy() # Shallow copy should be okay test_params[invalid_param] = invalid_edge - # We need to remove `Assert`s introduced by checks like - # `assert_negative_support` and disable test values; - # otherwise, we 
won't be able to create the - # `RandomVariable` - with aesara.config.change_flags(compute_test_value="off"): - invalid_dist = pymc3_dist.dist(**test_params) + invalid_dist = pymc3_dist.dist(**test_params) with aesara.config.change_flags(mode=Mode("py")): assert_equal( logcdf(invalid_dist, valid_value).eval(), @@ -817,8 +805,14 @@ def check_logcdf( ) # Test that method works with multiple values or raises informative TypeError - with pytest.raises(TypeError), aesara.config.change_flags(mode=Mode("py")): - logcdf(valid_dist, np.array([valid_value, valid_value])).eval() + try: + with aesara.config.change_flags(mode=Mode("py")): + logcdf(valid_dist, np.array([valid_value, valid_value])).eval() + except TypeError as err: + if not str(err).endswith( + ".logcdf expects a scalar value but received a 1-dimensional object." + ): + raise def check_selfconsistency_discrete_logcdf( self, distribution, domain, paramdomains, decimal=None, n_samples=100 @@ -835,13 +829,10 @@ def check_selfconsistency_discrete_logcdf( value = params.pop("value") values = np.arange(domain.lower, value + 1) dist = distribution.dist(**params) - # This only works for scalar random variables - assert dist.owner.op.ndim_supp == 0 - values_dist = change_rv_size(dist, values.shape) with aesara.config.change_flags(mode=Mode("py")): assert_almost_equal( logcdf(dist, value).eval(), - logsumexp(logpt(values_dist, values), keepdims=False).eval(), + logsumexp(logpt(dist, values), keepdims=False).eval(), decimal=decimal, err_msg=str(pt), ) @@ -883,8 +874,8 @@ def test_uniform(self): invalid_dist = Uniform.dist(lower=1, upper=0) with aesara.config.change_flags(mode=Mode("py")): - assert logpt(invalid_dist, np.array(0.5)).eval() == -np.inf - assert logcdf(invalid_dist, np.array(2.0)).eval() == -np.inf + assert logpt(invalid_dist, 0.5).eval() == -np.inf + assert logcdf(invalid_dist, 2).eval() == -np.inf @pytest.mark.xfail(reason="Distribution not refactored yet") def test_triangular(self): @@ -1005,6 +996,7 @@ def scipy_logp(value, mu, sigma, lower, upper): decimal=select_by_precision(float64=6, float32=1), ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_half_normal(self): self.check_logp( HalfNormal, @@ -1083,6 +1075,28 @@ def test_wald_logp_custom_points(self, value, mu, lam, phi, alpha, logp): decimals = select_by_precision(float64=6, float32=1) assert_almost_equal(model.fastlogp(pt), logp, decimal=decimals, err_msg=str(pt)) + def test_wald_logp(self): + self.check_logp( + Wald, + Rplus, + {"mu": Rplus, "alpha": Rplus}, + lambda value, mu, alpha: sp.invgauss.logpdf(value, mu=mu, loc=alpha), + decimal=select_by_precision(float64=6, float32=1), + ) + + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Poor CDF in SciPy. 
See scipy/scipy#869 for details.", + ) + def test_wald_logcdf(self): + self.check_logcdf( + Wald, + Rplus, + {"mu": Rplus, "alpha": Rplus}, + lambda value, mu, alpha: sp.invgauss.logcdf(value, mu=mu, loc=alpha), + ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_beta(self): self.check_logp( Beta, @@ -1110,6 +1124,7 @@ def scipy_log_pdf(value, a, b): self.check_logp(Kumaraswamy, Unit, {"a": Rplus, "b": Rplus}, scipy_log_pdf) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_exponential(self): self.check_logp( Exponential, @@ -1181,6 +1196,7 @@ def modified_scipy_hypergeom_logcdf(value, N, k, n): {"N": NatSmall, "k": NatSmall, "n": NatSmall}, ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_negative_binomial(self): def scipy_mu_alpha_logpmf(value, mu, alpha): return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha)) @@ -1237,6 +1253,7 @@ def scipy_mu_alpha_logcdf(value, mu, alpha): (5, 0.5, None, 2, "Can't specify both mu and p."), ], ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_negative_binomial_init_fail(self, mu, p, alpha, n, expected): with Model(): with pytest.raises(ValueError, match=f"Incompatible parametrization. {expected}"): @@ -1297,6 +1314,7 @@ def test_t(self): n_samples=10, ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_cauchy(self): self.check_logp( Cauchy, @@ -1311,6 +1329,7 @@ def test_cauchy(self): lambda value, alpha, beta: sp.cauchy.logcdf(value, alpha, beta), ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_half_cauchy(self): self.check_logp( HalfCauchy, @@ -1359,6 +1378,11 @@ def test_gamma_logcdf(self): skip_paramdomain_outside_edge_test=True, ) + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Fails on float32 due to numerical issues", + ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_inverse_gamma_logp(self): self.check_logp( InverseGamma, @@ -1389,6 +1413,7 @@ def test_inverse_gamma_logcdf(self): condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to scaling issues", ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_inverse_gamma_alt_params(self): def test_fun(value, mu, sigma): alpha, beta = InverseGamma._get_alpha_beta(None, None, mu, sigma) @@ -1417,6 +1442,10 @@ def test_pareto(self): lambda value, alpha, m: sp.pareto.logcdf(value, alpha, scale=m), ) + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Fails on float32 due to inf issues", + ) @pytest.mark.xfail(reason="Distribution not refactored yet") def test_weibull_logp(self): self.check_logp( @@ -1459,6 +1488,7 @@ def test_skew_normal(self): decimal=select_by_precision(float64=5, float32=3), ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_binomial(self): self.check_logp( Binomial, @@ -1483,6 +1513,10 @@ def test_binomial(self): # Too lazy to propagate decimal parameter through the whole chain of deps @pytest.mark.xfail(reason="Distribution not refactored yet") @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail( + condition=(SCIPY_VERSION < parse("1.4.0")), reason="betabinom is new in Scipy 1.4.0" + ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_beta_binomial_distribution(self): self.checkd( BetaBinomial, @@ -1523,8 +1557,8 @@ def test_beta_binomial_selfconsistency(self): {"alpha": Rplus, "beta": Rplus, "n": 
NatSmall}, ) - @pytest.mark.xfail(reason="Bernoulli logit_p not refactored yet") - def test_bernoulli_logit_p(self): + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_bernoulli(self): self.check_logp( Bernoulli, Bool, @@ -1571,6 +1605,7 @@ def test_discrete_weibull(self): {"q": Unit, "beta": Rplusdunif}, ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_poisson(self): self.check_logp( Poisson, @@ -1609,11 +1644,8 @@ def test_constantdist(self): self.check_logp(Constant, I, {"c": I}, lambda value, c: np.log(c == value)) # Too lazy to propagate decimal parameter through the whole chain of deps + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") - @pytest.mark.xfail( - condition=(aesara.config.floatX == "float32"), - reason="Fails on float32 due to inf issues", - ) def test_zeroinflatedpoisson_distribution(self): self.checkd( ZeroInflatedPoisson, @@ -1630,11 +1662,8 @@ def test_zeroinflatedpoisson_logcdf(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") - @pytest.mark.xfail( - condition=(aesara.config.floatX == "float32"), - reason="Fails on float32 due to inf issues", - ) def test_zeroinflatednegativebinomial_distribution(self): self.checkd( ZeroInflatedNegativeBinomial, @@ -1652,6 +1681,7 @@ def test_zeroinflatednegativebinomial_logcdf(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") def test_zeroinflatedbinomial_distribution(self): self.checkd( @@ -1725,6 +1755,7 @@ def test_mvnormal(self, n): condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to inf issues", ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_mvnormal_indef(self): cov_val = np.array([[1, 0.5], [0.5, -2]]) cov = at.matrix("cov") @@ -1739,13 +1770,14 @@ def test_mvnormal_indef(self): f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) - logp = logpt(MvNormal.dist(mu=mu, tau=cov), x) + logp = logp(MvNormal.dist(mu=mu, tau=cov), x) f_logp = aesara.function([cov, x], logp) assert f_logp(cov_val, np.ones(2)) == -np.inf dlogp = at.grad(logp, cov) f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_mvnormal_init_fail(self): with Model(): with pytest.raises(ValueError): @@ -1930,18 +1962,7 @@ def test_dirichlet_with_batch_shapes(self, dist_shape): with pm.Model() as model: d = pm.Dirichlet("d", a=a) - # Generate sample points to test - d_value = d.tag.value_var - d_point = d.eval().astype("float64") - d_point /= d_point.sum(axis=-1)[..., None] - - if hasattr(d_value.tag, "transform"): - d_point_trans = d_value.tag.transform.forward(d, at.as_tensor(d_point)).eval() - else: - d_point_trans = d_point - - pymc3_res = logpt(d, d_point_trans, jacobian=False).eval() - scipy_res = np.empty_like(pymc3_res) + pymc3_res = logpt(d, d.tag.test_value).eval() for idx in np.ndindex(a.shape[:-1]): scipy_res[idx] = scipy.stats.dirichlet(a[idx]).logpdf(d_point[idx]) @@ -1964,6 +1985,7 @@ def 
test_dirichlet_2D(self): ) @pytest.mark.parametrize("n", [2, 3]) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial(self, n): self.check_logp( Multinomial, Vector(Nat, n), {"p": Simplex(n), "n": Nat}, multinomial_logpdf @@ -1978,6 +2000,7 @@ def test_multinomial(self, n): [[0.3, 0.6, 0.05, 0.05], 10], ], ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_mode(self, p, n): _p = np.array(p) with Model() as model: @@ -2008,14 +2031,14 @@ def test_multinomial_mode(self, p, n): [[[0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25]], (2, 4), [17, 19]], ], ) - def test_multinomial_random(self, p, size, n): + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_multinomial_random(self, p, shape, n): p = np.asarray(p) with Model() as model: - m = Multinomial("m", n=n, p=p, size=size) + m = Multinomial("m", n=n, p=p, size=shape) + m.random() - assert m.eval().shape == size + p.shape - - @pytest.mark.skip(reason="Moment calculations have not been refactored yet") + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_mode_with_shape(self): n = [1, 10] p = np.asarray([[0.25, 0.25, 0.25, 0.25], [0.26, 0.26, 0.26, 0.22]]) @@ -2023,16 +2046,17 @@ def test_multinomial_mode_with_shape(self): m = Multinomial("m", n=n, p=p, size=(2, 4)) assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec(self): vals = np.array([[2, 4, 4], [3, 3, 4]]) p = np.array([0.2, 0.3, 0.5]) n = 10 with Model() as model_single: - Multinomial("m", n=n, p=p) + Multinomial("m", n=n, p=p, size=len(p)) with Model() as model_many: - Multinomial("m", n=n, p=p, size=2) + Multinomial("m", n=n, p=p, size=vals.shape) assert_almost_equal( scipy.stats.multinomial.logpmf(vals, n, p), @@ -2052,13 +2076,14 @@ def test_multinomial_vec(self): decimal=4, ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec_1d_n(self): vals = np.array([[2, 4, 4], [4, 3, 4]]) p = np.array([0.2, 0.3, 0.5]) ns = np.array([10, 11]) with Model() as model: - Multinomial("m", n=ns, p=p) + Multinomial("m", n=ns, p=p, size=vals.shape) assert_almost_equal( sum([multinomial_logpdf(val, n, p) for val, n in zip(vals, ns)]), @@ -2066,13 +2091,14 @@ def test_multinomial_vec_1d_n(self): decimal=4, ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec_1d_n_2d_p(self): vals = np.array([[2, 4, 4], [4, 3, 4]]) ps = np.array([[0.2, 0.3, 0.5], [0.9, 0.09, 0.01]]) ns = np.array([10, 11]) with Model() as model: - Multinomial("m", n=ns, p=ps) + Multinomial("m", n=ns, p=ps, size=vals.shape) assert_almost_equal( sum([multinomial_logpdf(val, n, p) for val, n, p in zip(vals, ns, ps)]), @@ -2080,13 +2106,14 @@ def test_multinomial_vec_1d_n_2d_p(self): decimal=4, ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec_2d_p(self): vals = np.array([[2, 4, 4], [3, 3, 4]]) ps = np.array([[0.2, 0.3, 0.5], [0.3, 0.3, 0.4]]) n = 10 with Model() as model: - Multinomial("m", n=n, p=ps) + Multinomial("m", n=n, p=ps, size=vals.shape) assert_almost_equal( sum([multinomial_logpdf(val, n, p) for val, p in zip(vals, ps)]), @@ -2094,6 +2121,7 @@ def test_multinomial_vec_2d_p(self): decimal=4, ) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_batch_multinomial(self): n = 10 vals = np.zeros((4, 5, 3), dtype="int32") @@ -2102,11 +2130,10 @@ def test_batch_multinomial(self): 
np.put_along_axis(vals, inds, n, axis=-1) np.put_along_axis(p, inds, 1, axis=-1) - dist = Multinomial.dist(n=n, p=p) - - value = at.tensor3(dtype="int32") + dist = Multinomial.dist(n=n, p=p, size=vals.shape) + value = aet.tensor3(dtype="int32") value.tag.test_value = np.zeros_like(vals, dtype="int32") - logp = at.exp(logpt(dist, value)) + logp = aet.exp(logpt(dist, value)) f = aesara.function(inputs=[value], outputs=logp) assert_almost_equal( f(vals), @@ -2144,6 +2171,24 @@ def test_dirichlet_multinomial_matches_beta_binomial(self): decimal=select_by_precision(float64=6, float32=3), ) + @pytest.mark.parametrize( + "a, n, shape", + [ + [[0.25, 0.25, 0.25, 0.25], 1, (1, 4)], + [[0.3, 0.6, 0.05, 0.05], 2, (1, 4)], + [[0.3, 0.6, 0.05, 0.05], 10, (1, 4)], + [[0.25, 0.25, 0.25, 0.25], 1, (2, 4)], + [[0.3, 0.6, 0.05, 0.05], 2, (3, 4)], + [[[0.25, 0.25, 0.25, 0.25], [0.26, 0.26, 0.26, 0.22]], [1, 10], (2, 4)], + ], + ) + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_dirichlet_multinomial_defaultval(self, a, n, shape): + a = np.asarray(a) + with Model() as model: + m = DirichletMultinomial("m", n=n, a=a, size=shape) + assert_allclose(m.distribution._defaultval.eval().sum(axis=-1), n) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_dirichlet_multinomial_vec(self): vals = np.array([[2, 4, 4], [3, 3, 4]]) @@ -2456,6 +2501,7 @@ def test_rice(self): lambda value, b, sigma: sp.rice.logpdf(value, b=b, loc=0, scale=sigma), ) + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") def test_moyal_logp(self): # Using a custom domain, because the standard `R` domain undeflows with scipy in float64 @@ -2516,19 +2562,22 @@ def test_bound(): LowerNormal = Bound(Normal, lower=1) dist = LowerNormal.dist(mu=0, sigma=1) assert logpt(dist, 0).eval() == -np.inf - # assert dist.transform is not None + assert dist.default() > 1 + assert dist.transform is not None assert np.all(dist.random() > 1) UpperNormal = Bound(Normal, upper=-1) dist = UpperNormal.dist(mu=0, sigma=1) assert logpt(dist, -0.5).eval() == -np.inf - # assert dist.transform is not None + assert dist.default() < -1 + assert dist.transform is not None assert np.all(dist.random() < -1) ArrayNormal = Bound(Normal, lower=[1, 2], upper=[2, 3]) dist = ArrayNormal.dist(mu=0, sigma=1, size=2) assert_equal(logpt(dist, [0.5, 3.5]).eval(), -np.array([np.inf, np.inf])) - # assert dist.transform is not None + assert_equal(dist.default(), np.array([1.5, 2.5])) + assert dist.transform is not None with pytest.raises(ValueError) as err: dist.random() err.match("Drawing samples from distributions with array-valued") @@ -2724,6 +2773,7 @@ def test_str(self): assert str_repr in model_str +@pytest.mark.xfail(reason="Distribution not refactored yet") def test_discrete_trafo(): with Model(): with pytest.raises(ValueError) as err: @@ -2834,3 +2884,16 @@ def func(x): import pickle pickle.loads(pickle.dumps(y)) + + +def test_hierarchical_logpt(): + with pm.Model() as m: + x = pm.Uniform("x", lower=0, upper=1) + y = pm.Uniform("y", lower=0, upper=x) + + # Make sure that hierarchical random variables are replaced with their + # log-likelihood space variables in the log-likelhood + logpt_ancestors = list(ancestors([m.logpt])) + assert not any(isinstance(v.owner.op, RandomVariable) for v in logpt_ancestors if v.owner) + assert x.tag.value_var in logpt_ancestors + assert y.tag.value_var in logpt_ancestors diff --git 
a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index ff3702c252..b5cec04986 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -55,6 +55,10 @@ product, ) +# XXX: This test module will need to be repurposed as tests for new +# `RandomVariable`s and their `RandomVariable.perform` methods. +pytestmark = pytest.mark.xfail(reason="This test relies on the deprecated Distribution interface") + def pymc3_random( dist, diff --git a/pymc3/tests/test_distributions_timeseries.py b/pymc3/tests/test_distributions_timeseries.py index 5f9ec3485d..fd3274e2e5 100644 --- a/pymc3/tests/test_distributions_timeseries.py +++ b/pymc3/tests/test_distributions_timeseries.py @@ -22,9 +22,10 @@ from pymc3.sampling import sample, sample_posterior_predictive from pymc3.tests.helpers import select_by_precision -# pytestmark = pytest.mark.usefixtures("seeded_test") pytestmark = pytest.mark.xfail(reason="This test relies on the deprecated Distribution interface") +pytestmark = pytest.mark.usefixtures("seeded_test") + def test_AR(): # AR1 diff --git a/pymc3/tests/test_examples.py b/pymc3/tests/test_examples.py index b79f9eaacb..8ef45019d5 100644 --- a/pymc3/tests/test_examples.py +++ b/pymc3/tests/test_examples.py @@ -241,7 +241,7 @@ def test_run(self): pm.sample(50, pm.Slice(), start=start) -@pytest.mark.xfail(reason="ZeroInflatedPoisson hasn't been refactored for v4") +@pytest.mark.xfail(reason="Metropolis samplers haven't been refactored") class TestLatentOccupancy(SeededTest): """ From the PyMC example list diff --git a/pymc3/tests/test_gp.py b/pymc3/tests/test_gp.py index bb85ef83ea..461096906c 100644 --- a/pymc3/tests/test_gp.py +++ b/pymc3/tests/test_gp.py @@ -26,6 +26,8 @@ from pymc3.math import cartesian, kronecker +pytestmark = pytest.mark.xfail(reason="GP not refactored") + np.random.seed(101) diff --git a/pymc3/tests/test_hmc.py b/pymc3/tests/test_hmc.py index 68585a178a..f25081a8c7 100644 --- a/pymc3/tests/test_hmc.py +++ b/pymc3/tests/test_hmc.py @@ -15,6 +15,7 @@ import numpy as np import numpy.testing as npt +import pytest import pymc3 @@ -26,6 +27,7 @@ logger = logging.getLogger("pymc3") +@pytest.mark.xfail(reason="Beta not refactored") def test_leapfrog_reversible(): n = 3 np.random.seed(42) diff --git a/pymc3/tests/test_minibatches.py b/pymc3/tests/test_minibatches.py index 64a8cbc42d..762447c421 100644 --- a/pymc3/tests/test_minibatches.py +++ b/pymc3/tests/test_minibatches.py @@ -208,12 +208,12 @@ def test_gradient_with_scaling(self): genvar = generator(gen1()) m = Normal("m") Normal("n", observed=genvar, total_size=1000) - grad1 = aesara.function([m.tag.value_var], at.grad(model1.logpt, m.tag.value_var)) + grad1 = aesara.function([m.tag.value_var], aet.grad(model1.logpt, m.tag.value_var)) with pm.Model() as model2: m = Normal("m") shavar = aesara.shared(np.ones((1000, 100))) Normal("n", observed=shavar) - grad2 = aesara.function([m.tag.value_var], at.grad(model2.logpt, m.tag.value_var)) + grad2 = aesara.function([m.tag.value_var], aet.grad(model2.logpt, m.tag.value_var)) for i in range(10): shavar.set_value(np.ones((100, 100)) * i) diff --git a/pymc3/tests/test_missing.py b/pymc3/tests/test_missing.py index 67f6635695..187a9b2265 100644 --- a/pymc3/tests/test_missing.py +++ b/pymc3/tests/test_missing.py @@ -22,12 +22,27 @@ from pymc3 import ImputationWarning, Model, Normal, sample, sample_prior_predictive -@pytest.mark.parametrize( - "data", - [ma.masked_values([1, 2, -1, 4, -1], value=-1), pd.DataFrame([1, 2, 
numpy.nan, 4, numpy.nan])], -) -def test_missing(data): +@pytest.mark.xfail("Missing values not fully refactored") +def test_missing(): + data = ma.masked_values([1, 2, -1, 4, -1], value=-1) + with Model() as model: + x = Normal("x", 1, 1) + with pytest.warns(ImputationWarning): + Normal("y", x, 1, observed=data) + + (y_missing,) = model.missing_values + assert y_missing.tag.test_value.shape == (2,) + + model.logp(model.test_point) + + with model: + prior_trace = sample_prior_predictive() + assert {"x", "y"} <= set(prior_trace.keys()) + +@pytest.mark.xfail(reason="Missing values not fully refactored") +def test_missing_pandas(): + data = pd.DataFrame([1, 2, numpy.nan, 4, numpy.nan]) with Model() as model: x = Normal("x", 1, 1) with pytest.warns(ImputationWarning): @@ -43,6 +58,7 @@ def test_missing(data): assert {"x", "y"} <= set(prior_trace.keys()) +@pytest.mark.xfail(reason="Missing values not fully refactored") def test_missing_with_predictors(): predictors = array([0.5, 1, 0.5, 2, 0.3]) data = ma.masked_values([1, 2, -1, 4, -1], value=-1) diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index ff43746865..fc7a647340 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -363,7 +363,6 @@ def test_multiple_observed_rv(): assert not model["x"] in model.value_vars -# @pytest.mark.xfail(reason="Functions depend on deprecated dshape/dsize") def test_tempered_logp_dlogp(): with pm.Model() as model: pm.Normal("x") diff --git a/pymc3/tests/test_model_helpers.py b/pymc3/tests/test_model_helpers.py index 00a2d0b477..93fdb97259 100644 --- a/pymc3/tests/test_model_helpers.py +++ b/pymc3/tests/test_model_helpers.py @@ -108,7 +108,8 @@ def test_pandas_to_array(self, input_dtype): # Make sure the returned object is a Aesara TensorVariable assert isinstance(wrapped, TensorVariable) - def test_as_tensor(self): + @pytest.mark.xfail(reason="`Observed` `Op` doesn't take `SparseConstant`s, yet") + def test_make_obs_var(self): """ Check returned values for `data` given known inputs to `as_tensor()`. 
@@ -128,13 +129,14 @@ def test_as_tensor(self): # Create the testval attribute simply for the sake of model testing fake_distribution.testval = None - # Alias the function to be tested - func = pm.model.make_obs_var - # Check function behavior using the various inputs - dense_output = func(dense_input, input_name, fake_model, fake_distribution) - sparse_output = func(sparse_input, input_name, fake_model, fake_distribution) - masked_output = func(masked_array_input, input_name, fake_model, fake_distribution) + dense_output = pm.model.make_obs_var(fake_distribution, dense_input, input_name, fake_model) + sparse_output = pm.model.make_obs_var( + fake_distribution, sparse_input, input_name, fake_model + ) + masked_output = pm.model.make_obs_var( + fake_distribution, masked_array_input, input_name, fake_model + ) # Ensure that the missing values are appropriately set to None for func_output in [dense_output, sparse_output]: From 2596a75da246950acb27f2eaaa86f836d42c7f9c Mon Sep 17 00:00:00 2001 From: Michael Osthege Date: Mon, 8 Mar 2021 22:11:53 +0100 Subject: [PATCH 05/44] Apply easy fixes to get tests to pass or xfail --- pymc3/gp/gp.py | 4 ++-- pymc3/tests/models.py | 2 +- pymc3/tests/test_coords.py | 2 +- pymc3/tests/test_data_container.py | 4 +--- pymc3/tests/test_gp.py | 2 -- pymc3/tests/test_missing.py | 2 +- pymc3/tests/test_model.py | 2 +- pymc3/tests/test_quadpotential.py | 1 + 8 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pymc3/gp/gp.py b/pymc3/gp/gp.py index 17e232f0c2..fa0507309c 100644 --- a/pymc3/gp/gp.py +++ b/pymc3/gp/gp.py @@ -280,7 +280,7 @@ def _build_prior(self, name, X, reparameterize=True, **kwargs): if reparameterize: chi2 = pm.ChiSquared(name + "_chi2_", self.nu) v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, size=shape, **kwargs) - f = pm.Deterministic(name, (at.sqrt(self.nu) / chi2) * (mu + cholesky(cov).dot(v))) + f = pm.Deterministic(name, (aet.sqrt(self.nu) / chi2) * (mu + cholesky(cov).dot(v))) else: f = pm.MvStudentT(name, nu=self.nu, mu=mu, cov=cov, size=shape, **kwargs) return f @@ -891,7 +891,7 @@ def _build_prior(self, name, Xs, **kwargs): chols = [cholesky(stabilize(cov(X))) for cov, X in zip(self.cov_funcs, Xs)] # remove reparameterization option v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, size=self.N, **kwargs) - f = pm.Deterministic(name, mu + at.flatten(kron_dot(chols, v))) + f = pm.Deterministic(name, mu + aet.flatten(kron_dot(chols, v))) return f def prior(self, name, Xs, **kwargs): diff --git a/pymc3/tests/models.py b/pymc3/tests/models.py index 3d4b471e7a..b2f98ef87b 100644 --- a/pymc3/tests/models.py +++ b/pymc3/tests/models.py @@ -30,7 +30,7 @@ def simple_model(): mu = -2.1 tau = 1.3 with Model() as model: - Normal("x", mu, tau=tau, size=2, testval=np.ones(2) * 0.1) + Normal("x", mu, tau=tau, size=2, testval=floatX_array([0.1, 0.1])) return model.initial_point, model, (mu, tau ** -0.5) diff --git a/pymc3/tests/test_coords.py b/pymc3/tests/test_coords.py index f8ba32dafa..c668b1e147 100644 --- a/pymc3/tests/test_coords.py +++ b/pymc3/tests/test_coords.py @@ -4,7 +4,7 @@ import pymc3 as pm -@pytest.mark.xfail("Arviz incompatibilities") +@pytest.mark.xfail(reason="Arviz incompatibilities") def test_coords(): chains = 2 n_features = 3 diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index d6252470c6..3050893031 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -159,9 +159,7 @@ def test_shared_data_as_rv_input(self): with pm.Model() as m: x = 
pm.Data("x", [1.0, 2.0, 3.0]) _ = pm.Normal("y", mu=x, size=3) - trace = pm.sample( - chains=1, return_inferencedata=False, compute_convergence_checks=False - ) + trace = pm.sample(chains=1) np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), x.get_value(), atol=1e-1) np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), trace["y"].mean(0), atol=1e-1) diff --git a/pymc3/tests/test_gp.py b/pymc3/tests/test_gp.py index 461096906c..bb85ef83ea 100644 --- a/pymc3/tests/test_gp.py +++ b/pymc3/tests/test_gp.py @@ -26,8 +26,6 @@ from pymc3.math import cartesian, kronecker -pytestmark = pytest.mark.xfail(reason="GP not refactored") - np.random.seed(101) diff --git a/pymc3/tests/test_missing.py b/pymc3/tests/test_missing.py index 187a9b2265..5ba71651b6 100644 --- a/pymc3/tests/test_missing.py +++ b/pymc3/tests/test_missing.py @@ -22,7 +22,7 @@ from pymc3 import ImputationWarning, Model, Normal, sample, sample_prior_predictive -@pytest.mark.xfail("Missing values not fully refactored") +@pytest.mark.xfail(reason="Missing values not fully refactored") def test_missing(): data = ma.masked_values([1, 2, -1, 4, -1], value=-1) with Model() as model: diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index fc7a647340..85b515a3b6 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -162,7 +162,7 @@ def test_observed_rv_fail(self): Normal("n", observed=x) def test_observed_type(self): - X_ = pm.floatX(np.random.randn(100, 5)) + X_ = np.random.randn(100, 5).astype(aesara.config.floatX) X = pm.floatX(aesara.shared(X_)) with pm.Model(): x1 = pm.Normal("x1", observed=X_) diff --git a/pymc3/tests/test_quadpotential.py b/pymc3/tests/test_quadpotential.py index 2b96b2149e..e92b42fb40 100644 --- a/pymc3/tests/test_quadpotential.py +++ b/pymc3/tests/test_quadpotential.py @@ -263,6 +263,7 @@ def test_full_adapt_warn(): quadpotential.QuadPotentialFullAdapt(2, np.zeros(2), np.eye(2), 0) +@pytest.mark.xfail(reason="MvNormal was not yet refactored") def test_full_adapt_sampling(seed=289586): np.random.seed(seed) From 8bd473327d53547dbc0e2f2e30a858fd3d92064b Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Tue, 9 Mar 2021 16:41:52 -0600 Subject: [PATCH 06/44] Reinstate log-likelihood transforms --- pymc3/backends/base.py | 13 +- pymc3/distributions/__init__.py | 212 ++++++++++++++++------------ pymc3/distributions/continuous.py | 38 +++-- pymc3/distributions/distribution.py | 84 +---------- pymc3/distributions/transforms.py | 76 ---------- pymc3/model.py | 75 ++++++---- pymc3/tests/test_transforms.py | 69 +++++---- 7 files changed, 253 insertions(+), 314 deletions(-) diff --git a/pymc3/backends/base.py b/pymc3/backends/base.py index 3a5f37f3fa..2187ed914b 100644 --- a/pymc3/backends/base.py +++ b/pymc3/backends/base.py @@ -61,7 +61,18 @@ def __init__(self, name, model=None, vars=None, test_point=None): model = modelcontext(model) self.model = model if vars is None: - vars = [v.tag.value_var for v in model.unobserved_RVs] + vars = [] + for v in model.unobserved_RVs: + var = getattr(v.tag, "value_var", v) + transform = getattr(var.tag, "transform", None) + if transform: + # We need to create and add an un-transformed version of + # each transformed variable + untrans_var = transform.backward(var) + untrans_var.name = v.name + vars.append(untrans_var) + vars.append(var) + self.vars = vars self.varnames = [var.name for var in vars] self.fn = model.fastfn(vars) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index bd4ab5f1c3..241b2b54a7 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from functools import singledispatch +from itertools import chain from typing import Generator, List, Optional, Tuple, Union import aesara.tensor as aet @@ -31,6 +32,11 @@ ] +@singledispatch +def logp_transform(op, inputs): + return None + + def _get_scaling(total_size, shape, ndim): """ Gets scaling constant for logp @@ -135,7 +141,6 @@ def change_rv_size( def rv_log_likelihood_args( rv_var: TensorVariable, - rv_value: Optional[TensorVariable] = None, transformed: Optional[bool] = True, ) -> Tuple[TensorVariable, TensorVariable]: """Get a `RandomVariable` and its corresponding log-likelihood `TensorVariable` value. @@ -146,38 +151,24 @@ def rv_log_likelihood_args( A variable corresponding to a `RandomVariable`, whether directly or indirectly (e.g. an observed variable that's the output of an `Observed` `Op`). - rv_value - The measure-space input `TensorVariable` (i.e. "input" to a - log-likelihood). transformed When ``True``, return the transformed value var. Returns ======= The first value in the tuple is the `RandomVariable`, and the second is the - measure-space variable that corresponds with the latter. The first is used - to determine the log likelihood graph and the second is the "input" - parameter to that graph. In the case of an observed `RandomVariable`, the - "input" is actual data; in all other cases, it's just another - `TensorVariable`. + measure-space variable that corresponds with the latter (i.e. the "value" + variable). 
""" - if rv_value is None: - if rv_var.owner and isinstance(rv_var.owner.op, Observed): - rv_var, rv_value = rv_var.owner.inputs - elif hasattr(rv_var.tag, "value_var"): - rv_value = rv_var.tag.value_var - else: - return rv_var, None - - rv_value = aet.as_tensor_variable(rv_value) - - transform = getattr(rv_value.tag, "transform", None) - if transformed and transform: - rv_value = transform.forward(rv_value) - - return rv_var, rv_value + if rv_var.owner and isinstance(rv_var.owner.op, Observed): + return tuple(rv_var.owner.inputs) + elif hasattr(rv_var.tag, "value_var"): + rv_value = rv_var.tag.value_var + return rv_var, rv_value + else: + return rv_var, None def rv_ancestors(graphs: List[TensorVariable]) -> Generator[TensorVariable, None, None]: @@ -197,23 +188,53 @@ def strip_observed(x: TensorVariable) -> TensorVariable: return x -def sample_to_measure_vars(graphs: List[TensorVariable]) -> List[TensorVariable]: - """Replace `RandomVariable` terms in graphs with their measure-space counterparts.""" +def sample_to_measure_vars( + graphs: List[TensorVariable], +) -> Tuple[List[TensorVariable], List[TensorVariable]]: + """Replace sample-space variables in graphs with their measure-space counterparts. + + Sample-space variables are `TensorVariable` outputs of `RandomVariable` + `Op`s. Measure-space variables are `TensorVariable`s that correspond to + the value of a sample-space variable in a likelihood function (e.g. ``x`` + in ``p(X = x)``, where ``X`` is the corresponding sample-space variable). + (``x`` is also the variable found in ``rv_var.tag.value_var``, so this + function could also be called ``sample_to_value_vars``.) + + Parameters + ========== + graphs + The graphs in which random variables are to be replaced by their + measure variables. + + Returns + ======= + Tuple containing the transformed graphs and a ``dict`` of the replacements + that were made. + """ replace = {} - for anc in ancestors(graphs): - if anc.owner and isinstance(anc.owner.op, RandomVariable): - measure_var = getattr(anc.tag, "value_var", None) - if measure_var is not None: - replace[anc] = measure_var + for anc in chain(rv_ancestors(graphs), graphs): - dist_params = clone_replace(graphs, replace=replace) - return dist_params + if not (anc.owner and isinstance(anc.owner.op, RandomVariable)): + continue + + _, value_var = rv_log_likelihood_args(anc) + + if value_var is not None: + replace[anc] = value_var + + if replace: + measure_graphs = clone_replace(graphs, replace=replace) + else: + measure_graphs = graphs + + return measure_graphs, replace def logpt( rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, - jacobian: bool = True, + jacobian: Optional[bool] = True, + transformed: Optional[bool] = True, scaling: Optional[bool] = True, **kwargs, ) -> TensorVariable: @@ -229,15 +250,26 @@ def logpt( rv_var The `RandomVariable` output that determines the log-likelihood graph. rv_value - The input variable for the log-likelihood graph. + The input variable for the log-likelihood graph. If `rv_value` is + a transformed variable, its transformations will be applied. + If no value is provided, `rv_var.tag.value_var` will be checked and, + when available, used. jacobian Whether or not to include the Jacobian term. + transformed + Return the transformed version of the log-likelihood graph. scaling A scaling term to apply to the generated log-likelihood graph. 
""" - rv_var, rv_value = rv_log_likelihood_args(rv_var, rv_value) + rv_var, rv_value_var = rv_log_likelihood_args(rv_var) + + if rv_value is None: + rv_value = rv_value_var + else: + rv_value = aet.as_tensor(rv_value) + rv_node = rv_var.owner if not rv_node: @@ -245,13 +277,13 @@ def logpt( if not isinstance(rv_node.op, RandomVariable): + # This will probably need another generic function... if isinstance(rv_node.op, (Subtensor, AdvancedSubtensor, AdvancedSubtensor1)): raise NotImplementedError("Missing value support is incomplete") # "Flatten" and sum an array of indexed RVs' log-likelihoods rv_var, missing_values = rv_node.inputs - rv_value = rv_var.tag.value_var missing_values = missing_values.data logp_var = aet.sum( @@ -269,28 +301,40 @@ def logpt( return aet.zeros_like(rv_var) + if rv_value_var is None: + raise NotImplementedError(f"The log-likelihood for {rv_var} is undefined") + + # This case should be reached when `rv_var` is either the result of an + # `Observed` or a `RandomVariable` `Op` rng, size, dtype, *dist_params = rv_node.inputs - dist_params = sample_to_measure_vars(dist_params) + dist_params, replacements = sample_to_measure_vars(dist_params) - if jacobian: - logp_var = _logp(rv_node.op, rv_value, *dist_params, **kwargs) - else: - logp_var = _logp_nojac(rv_node.op, rv_value, *dist_params, **kwargs) + logp_var = _logp(rv_node.op, rv_value_var, *dist_params, **kwargs) + + # If any of the measure vars are transformed measure-space variables + # (signified by having a `transform` value in their tags), then we apply + # the their transforms and add their Jacobians (when enabled) + if transformed: + logp_var = transform_logp( + logp_var, + tuple(replacements.values()) + (rv_value_var,), + ) - # Replace `RandomVariable` ancestors with their corresponding - # log-likelihood input variables - lik_replacements = [ - (v, v.tag.value_var) - for v in ancestors([logp_var]) - if v.owner and isinstance(v.owner.op, RandomVariable) and getattr(v.tag, "value_var", None) - ] + transform = getattr(rv_value_var.tag, "transform", None) - (logp_var,) = clone_replace([logp_var], replace=lik_replacements) + if transform and jacobian: + transformed_jacobian = transform.jacobian_det(rv_value_var) + if transformed_jacobian: + if logp_var.ndim > transformed_jacobian.ndim: + logp_var = logp_var.sum(axis=-1) + logp_var += transformed_jacobian + + (logp_var,) = clone_replace([logp_var], replace={rv_value_var: rv_value}) if scaling: logp_var *= _get_scaling( - getattr(rv_var.tag, "total_size", None), rv_value.shape, rv_value.ndim + getattr(rv_var.tag, "total_size", None), rv_value_var.shape, rv_value_var.ndim ) if rv_var.name is not None: @@ -299,6 +343,25 @@ def logpt( return logp_var +def transform_logp(logp_var: TensorVariable, inputs: List[TensorVariable]) -> TensorVariable: + """Transform the inputs of a log-likelihood graph.""" + trans_replacements = {} + for measure_var in inputs: + + transform = getattr(measure_var.tag, "transform", None) + + if transform is None: + continue + + trans_rv_value = transform.backward(measure_var) + trans_replacements[measure_var] = trans_rv_value + + if trans_replacements: + (logp_var,) = clone_replace([logp_var], trans_replacements) + + return logp_var + + @singledispatch def _logp(op, value, *dist_params, **kwargs): """Create a log-likelihood graph. 
@@ -311,10 +374,10 @@ def _logp(op, value, *dist_params, **kwargs): return aet.zeros_like(value) -def logcdf(rv_var, rv_value, **kwargs): +def logcdf(rv_var, rv_value, transformed=True, jacobian=True, **kwargs): """Create a log-CDF graph.""" - rv_var, rv_value = rv_log_likelihood_args(rv_var, rv_value) + rv_var, rv_value = rv_log_likelihood_args(rv_var) rv_node = rv_var.owner if not rv_node: @@ -322,9 +385,16 @@ def logcdf(rv_var, rv_value, **kwargs): rng, size, dtype, *dist_params = rv_node.inputs - dist_params = sample_to_measure_vars(dist_params) + dist_params, replacements = sample_to_measure_vars(dist_params) - return _logcdf(rv_node.op, rv_value, *dist_params, **kwargs) + logp_var = _logcdf(rv_node.op, rv_value, *dist_params, **kwargs) + + if transformed: + logp_var = transform_logp( + logp_var, tuple(replacements.values()) + (rv_value,), jacobian=jacobian + ) + + return logp_var @singledispatch @@ -339,38 +409,6 @@ def _logcdf(op, value, *args, **kwargs): raise NotImplementedError() -def logp_nojac(rv_var, rv_value=None, **kwargs): - """Create a graph of the log-likelihood that doesn't include the Jacobian.""" - - rv_var, rv_value = rv_log_likelihood_args(rv_var, rv_value) - rv_node = rv_var.owner - - if not rv_node: - raise TypeError() - - rng, size, dtype, *dist_params = rv_node.inputs - - dist_params = sample_to_measure_vars(dist_params) - - return _logp_nojac(rv_node.op, rv_value, **kwargs) - - -@singledispatch -def _logp_nojac(op, value, *args, **kwargs): - """Return the logp, but do not include a jacobian term for transforms. - - If we use different parametrizations for the same distribution, we - need to add the determinant of the jacobian of the transformation - to make sure the densities still describe the same distribution. - However, MAP estimates are not invariant with respect to the - parameterization, we need to exclude the jacobian terms in this case. - - This function should be overwritten in base classes for transformed - distributions. - """ - return logpt(op, value, *args, **kwargs) - - def logpt_sum(rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, **kwargs): """Return the sum of the logp values for the given observations. 
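As a usage sketch of the refactored interface (assuming a distribution that has already been ported, e.g. `Normal`), `logpt` builds the log-likelihood graph for a model variable either at its registered value variable or at an explicit value:

import numpy as np
import pymc3 as pm

from pymc3.distributions import logpt

with pm.Model() as m:
    x = pm.Normal("x", mu=0.0, sigma=1.0)

# With an explicit value, the graph is evaluated there; with no value given,
# `x.tag.value_var` is used instead.
print(logpt(x, np.array(0.5)).eval())  # standard normal log-density, about -1.044
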
diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 535974b7d1..1856bcba9f 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -35,7 +35,7 @@ from scipy.interpolate import InterpolatedUnivariateSpline from pymc3.aesaraf import floatX -from pymc3.distributions import _logcdf, _logp, transforms +from pymc3.distributions import _logcdf, _logp, logp_transform, transforms from pymc3.distributions.dist_math import ( SplineWrapper, betaln, @@ -100,28 +100,41 @@ class PositiveContinuous(Continuous): """Base class for positive continuous distributions""" - default_transform = transforms.log - class UnitContinuous(Continuous): """Base class for continuous distributions on [0,1]""" - default_transform = transforms.logodds - class BoundedContinuous(Continuous): """Base class for bounded continuous distributions""" - default_transform = "auto" - - def create_transform(transform="auto", lower=None, upper=None): @logp_transform.register(PositiveContinuous) -def pos_cont_transform(op): +def pos_cont_transform(op, inputs): return transforms.log - return transform +@logp_transform.register(UnitContinuous) +def unit_cont_transform(op, inputs): + return transforms.logodds + + +@logp_transform.register(BoundedContinuous) +def bounded_cont_transform(op, inputs): + _, _, _, lower, upper = inputs + lower = aet.as_tensor_variable(lower) if lower is not None else None + upper = aet.as_tensor_variable(upper) if upper is not None else None + + if lower is None and upper is None: + transform = None + elif lower is not None and upper is None: + transform = transforms.lowerbound(lower) + elif lower is None and upper is not None: + transform = transforms.upperbound(upper) + else: + transform = transforms.interval(lower, upper) + + return transform def assert_negative_support(var, label, distname, value=-1e-6): @@ -222,11 +235,10 @@ def dist(cls, lower=0, upper=1, **kwargs): upper = aet.as_tensor_variable(floatX(upper)) # mean = (upper + lower) / 2.0 # median = self.mean + return super().dist([lower, upper], **kwargs) - transform = kwargs.pop("transform", cls.default_transform) - transform = cls.create_transform(transform, lower, upper) - return super().dist([lower, upper], transform=transform, **kwargs) +BoundedContinuous.register(UniformRV) @_logp.register(UniformRV) diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index bb4a1681e0..acf00ef970 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -18,8 +18,7 @@ import types import warnings -from abc import ABCMeta -from copy import copy +from abc import ABC from typing import TYPE_CHECKING import dill @@ -62,76 +61,10 @@ class _Unpickling: pass -class DistributionMeta(ABCMeta): - def __new__(cls, name, bases, clsdict): - - # Forcefully deprecate old v3 `Distribution`s - if "random" in clsdict: - - def _random(*args, **kwargs): - warnings.warn( - "The old `Distribution.random` interface is deprecated.", - DeprecationWarning, - stacklevel=2, - ) - return clsdict["random"](*args, **kwargs) - - clsdict["random"] = _random - - rv_op = clsdict.setdefault("rv_op", None) - rv_type = None - - if isinstance(rv_op, RandomVariable): - if not rv_op.inplace: - # TODO: This is a temporary work-around. - # Remove this once we know what we want regarding RNG states - # and their propagation. 
- rv_op = copy(rv_op) - rv_op.inplace = True - clsdict["rv_op"] = rv_op - - rv_type = type(rv_op) - - new_cls = super().__new__(cls, name, bases, clsdict) - - if rv_type is not None: - # Create dispatch functions - - class_logp = clsdict.get("logp") - if class_logp: - - @_logp.register(rv_type) - def logp(op, var, rvs_to_values, *dist_params, **kwargs): - value_var = rvs_to_values.get(var, var) - return class_logp(value_var, *dist_params, **kwargs) - - class_logcdf = clsdict.get("logcdf") - if class_logcdf: - - @_logcdf.register(rv_type) - def logcdf(op, var, rvs_to_values, *dist_params, **kwargs): - value_var = rvs_to_values.get(var, var) - return class_logcdf(value_var, *dist_params, **kwargs) - - # class_transform = clsdict.get("transform") - # if class_transform: - # - # @logp_transform.register(rv_type) - # def transform(op, *args, **kwargs): - # return class_transform(*args, **kwargs) - - # Register the Aesara `RandomVariable` type as a subclass of this - # `Distribution` type. - new_cls.register(rv_type) - - return new_cls - - -class Distribution(metaclass=DistributionMeta): +class Distribution(ABC): """Statistical distribution""" rv_op = None - default_transform = None def __new__(cls, name, *args, **kwargs): try: @@ -163,19 +96,19 @@ def __new__(cls, name, *args, **kwargs): if "shape" in kwargs: raise DeprecationWarning("The `shape` keyword is deprecated; use `size`.") + transform = kwargs.pop("transform", None) + rv_out = cls.dist(*args, rng=rng, **kwargs) - return model.register_rv(rv_out, name, data, total_size, dims=dims) + return model.register_rv(rv_out, name, data, total_size, dims=dims, transform=transform) @classmethod def dist(cls, dist_params, **kwargs): - transform = kwargs.pop("transform", cls.default_transform) + testval = kwargs.pop("testval", None) rv_var = cls.rv_op(*dist_params, **kwargs) - rv_var.tag.transform = transform - if testval is not None: rv_var.tag.test_value = testval @@ -329,10 +262,7 @@ class Discrete(Distribution): def __new__(cls, name, *args, **kwargs): - if kwargs.get("transform", None): - raise ValueError("Transformations for discrete distributions") - - return super().__new__(cls, name, *args, **kwargs) + super().__init__(shape, dtype, defaults=defaults, *args, **kwargs) class Continuous(Distribution): diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index b6741af6a3..67f47f1028 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -15,10 +15,8 @@ import aesara.tensor as at from aesara.tensor.subtensor import advanced_set_subtensor1 -from aesara.tensor.type import TensorType from pymc3.aesaraf import floatX, gradient -from pymc3.distributions import distribution from pymc3.math import invlogit, logit, logsumexp __all__ = [ @@ -110,80 +108,6 @@ def jacobian_det(self, x): return aet.log(aet.abs_(grad)) -class TransformedDistribution(distribution.Distribution): - """A distribution that has been transformed from one space into another.""" - - def __init__(self, dist, transform, *args, **kwargs): - """ - Parameters - ---------- - dist: Distribution - transform: Transform - args, kwargs - arguments to Distribution""" - forward = transform.forward - testval = forward(dist.default()) - - self.dist = dist - self.transform_used = transform - # XXX: `FreeRV` no longer exists - v = None # forward(FreeRV(name="v", distribution=dist)) - self.type = v.type - - super().__init__(v.shape.tag.test_value, v.dtype, testval, dist.defaults, *args, **kwargs) - - if transform.name == 
"stickbreaking": - b = np.hstack(((np.atleast_1d(self.shape) == 1)[:-1], False)) - # force the last dim not broadcastable - self.type = TensorType(v.dtype, b) - - def logp(self, x): - """ - Calculate log-probability of Transformed distribution at specified value. - - Parameters - ---------- - x: numeric - Value for which log-probability is calculated. - - Returns - ------- - TensorVariable - """ - logp_nojac = self.logp_nojac(x) - jacobian_det = self.transform_used.jacobian_det(x) - if logp_nojac.ndim > jacobian_det.ndim: - logp_nojac = logp_nojac.sum(axis=-1) - return logp_nojac + jacobian_det - - def logp_nojac(self, x): - """ - Calculate log-probability of Transformed distribution at specified value - without jacobian term for transforms. - - Parameters - ---------- - x: numeric - Value for which log-probability is calculated. - - Returns - ------- - TensorVariable - """ - return self.dist.logp(self.transform_used.backward(x)) - - def _repr_latex_(self, **kwargs): - # prevent TransformedDistributions from ending up in LaTeX representations - # of models - return None - - def _distr_parameters_for_repr(self): - return [] - - -transform = Transform - - class Log(ElemwiseTransform): name = "log" diff --git a/pymc3/model.py b/pymc3/model.py index e649b2eb08..9b9601578c 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -29,7 +29,7 @@ from aesara.compile.sharedvalue import SharedVariable from aesara.gradient import grad -from aesara.graph.basic import Variable +from aesara.graph.basic import Constant, Variable, graph_inputs from aesara.tensor.random.op import Observed, observed from aesara.tensor.var import TensorVariable from pandas import Series @@ -39,7 +39,7 @@ from pymc3.aesaraf import generator, gradient, hessian, inputvars from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.data import GenTensorVariable, Minibatch -from pymc3.distributions import change_rv_size, logpt, logpt_sum +from pymc3.distributions import change_rv_size, logp_transform, logpt, logpt_sum from pymc3.exceptions import ImputationWarning from pymc3.math import flatten_list from pymc3.util import WithMemoization, get_var_name @@ -680,7 +680,7 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): with self: free_RVs_logp = aet.sum( [ - aet.sum(logpt(var, getattr(var.tag, "value_var", None))) + aet.sum(logpt(var, getattr(var.tag, "value_var", None), transformed=True)) for var in self.free_RVs + self.potentials ] ) @@ -691,19 +691,17 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): costs = [self.logpt] input_vars = {i for i in graph_inputs(costs) if not isinstance(i, Constant)} - extra_vars = [getattr(var.tag, "value_var", var) for var in self.free_RVs] - extra_vars_and_values = { - var: self.initial_point[var.name] - for var in extra_vars - if var in input_vars and var not in grad_vars - } - return ValueGradFunction(costs, grad_vars, extra_vars_and_values, **kwargs) + extra_vars = [var for var in self.free_RVs if var in input_vars] + return ValueGradFunction(costs, grad_vars, extra_vars, **kwargs) @property def logpt(self): """Aesara scalar of log-probability of the model""" with self: - factors = [logpt_sum(var, getattr(var.tag, "value_var", None)) for var in self.free_RVs] + factors = [ + logpt_sum(var, getattr(var.tag, "value_var", None), transformed=True) + for var in self.free_RVs + ] factors += [logpt_sum(obs) for obs in self.observed_RVs] factors += self.potentials logp_var = aet.sum([aet.sum(factor) for factor in factors]) @@ -723,7 +721,9 @@ 
def logp_nojact(self): """ with self: factors = [ - logpt_sum(var, getattr(var.tag, "value_var", None), jacobian=False) + logpt_sum( + var, getattr(var.tag, "value_var", None), jacobian=False, transformed=True + ) for var in self.free_RVs ] factors += [logpt_sum(obs, jacobian=False) for obs in self.observed_RVs] @@ -740,7 +740,10 @@ def varlogpt(self): """Aesara scalar of log-probability of the unobserved random variables (excluding deterministic).""" with self: - factors = [logpt_sum(var, getattr(var.tag, "value_var", None)) for var in self.free_RVs] + factors = [ + logpt_sum(var, getattr(var.tag, "value_var", None), transformed=True) + for var in self.free_RVs + ] return aet.sum(factors) @property @@ -794,7 +797,7 @@ def independent_vars(self): @property def test_point(self): """Test point used to check that the model doesn't generate errors""" - return Point(((var.tag.value_var, var.tag.test_value) for var in self.free_RVs), model=self) + return Point(((var, var.tag.test_value) for var in self.vars), model=self) @property def disc_vars(self): @@ -836,7 +839,7 @@ def add_coords(self, coords): else: self.coords[name] = coords[name] - def register_rv(self, rv_var, name, data=None, total_size=None, dims=None): + def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, transform=None): """Register an (un)observed random variable with the model. Parameters @@ -868,17 +871,24 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None): # In all other cases, the role of the value variable is taken by # observed data. That's why value variables are only referenced in # this branch of the conditional. - value_var = rv_var.clone() - value_var.name = rv_var.name - rv_var.tag.value_var = value_var - self.free_RVs.append(rv_var) + value_var = rv_var.clone() - transform = rv_var.tag.transform - value_var.tag.transform = None - + transform = transform or logp_transform(rv_var.owner.op, rv_var.owner.inputs) if transform is not None: - self.deterministics.append(rv_var) + value_var.tag.transform = transform + value_var.name = f"{rv_var.name}_{transform.name}" + if aesara.config.compute_test_value != "off": + value_var.tag.test_value = transform.forward(value_var).tag.test_value + + # The transformed variable needs to be a named variable in the + # model, too + self.named_vars[value_var.name] = value_var + else: + value_var = rv_var.clone() + value_var.name = rv_var.name + + rv_var.tag.value_var = value_var elif isinstance(data, dict): @@ -977,7 +987,7 @@ def __getitem__(self, key): except KeyError: raise e - def makefn(self, outs, mode=None, *args, **kwargs): + def makefn(self, outs, mode=None, transformed=True, *args, **kwargs): """Compiles a Aesara function which returns ``outs`` and takes the variable ancestors of ``outs`` as inputs. @@ -991,8 +1001,11 @@ def makefn(self, outs, mode=None, *args, **kwargs): Compiled Aesara function """ with self: + vars = [ + v if not transformed else getattr(v.tag, "transformed_var", v) for v in self.vars + ] return aesara.function( - self.value_vars, + vars, outs, allow_input_downcast=True, on_unused_input="ignore", @@ -1343,7 +1356,7 @@ def fastfn(outs, mode=None, model=None): return model.fastfn(outs, mode) -def Point(*args, filter_model_vars=True, **kwargs): +def Point(*args, filter_model_vars=False, **kwargs): """Build a point. Uses same args as dict() does. Filters out variables not in the model. All keys are strings. 
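To make the naming and transform selection in `register_rv` concrete, here is a small sketch (the transform itself comes from the `logp_transform` dispatch for bounded continuous variables; the printed name is illustrative):

import pymc3 as pm

with pm.Model() as m:
    u = pm.Uniform("u", lower=0.0, upper=1.0)

value_var = u.tag.value_var

# The measure-space variable carries the automatically selected transform and
# is registered in the model under "<rv name>_<transform name>".
assert value_var.tag.transform is not None
print(value_var.name)  # e.g. "u_interval"
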
@@ -1485,6 +1498,13 @@ def make_obs_var( # We try to reuse the old test value rv_var.tag.test_value = np.broadcast_to(test_value, rv_var.tag.test_value.shape) + # An independent variable used as the generic log-likelihood input + # parameter (i.e. the measure-space counterpart to the sample-space + # variable `rv_var`). + value_var = rv_var.clone() + rv_var.tag.value_var = value_var + value_var.name = f"{rv_var.name}" + missing_values = None mask = getattr(data, "mask", None) if mask is not None: @@ -1500,9 +1520,7 @@ def make_obs_var( data = aet.set_subtensor(constant[mask.nonzero()], missing_values) # Now, we need log-likelihood-space terms for these missing values - value_var = rv_var.clone() value_var.name = f"{rv_var.name}_missing" - rv_var.tag.value_var = value_var elif sps.issparse(data): data = sparse.basic.as_sparse(data, name=name) @@ -1590,7 +1608,6 @@ def Potential(name, var, model=None): model = modelcontext(model) var.name = model.name_for(name) var.tag.scaling = None - var.tag.transform = None model.potentials.append(var) model.add_random_variable(var) return var diff --git a/pymc3/tests/test_transforms.py b/pymc3/tests/test_transforms.py index fd32d8b9b6..1d52fdf662 100644 --- a/pymc3/tests/test_transforms.py +++ b/pymc3/tests/test_transforms.py @@ -229,7 +229,7 @@ def test_interval_near_boundary(): with pm.Model() as model: pm.Uniform("x", testval=x0, lower=lb, upper=ub) - log_prob = model.point_logps() + log_prob = model.check_test_point() np.testing.assert_allclose(log_prob, np.array([-52.68])) @@ -286,34 +286,38 @@ def check_transform_elementwise_logp(self, model): x0 = x.tag.value_var assert x.ndim == logpt(x).ndim - pt = model.initial_point + pt = model.test_point array = np.random.randn(*pt[x0.name].shape) transform = x0.tag.transform - logp_notrans = logpt(x, transform.backward(x, array), transformed=False) + logp_nojac = logpt(x, transform.backward(array), jacobian=False) + jacob_det = transform.jacobian_det(aesara.shared(array)) + assert logpt(x).ndim == jacob_det.ndim jacob_det = transform.jacobian_det(x, aesara.shared(array)) assert logpt(x).ndim == jacob_det.ndim - v1 = logpt(x, array, jacobian=False).eval() - v2 = logp_notrans.eval() - close_to(v1, v2, tol) + close_to(logpt(x, array).eval(), elementwiselogp.eval(), tol) def check_vectortransform_elementwise_logp(self, model, vect_opt=0): x = model.free_RVs[0] x0 = x.tag.value_var assert (x.ndim - 1) == logpt(x).ndim - pt = model.initial_point + pt = model.test_point array = np.random.randn(*pt[x0.name].shape) transform = x0.tag.transform - logp_nojac = logpt(x, transform.backward(x, array), transformed=False) - - jacob_det = transform.jacobian_det(x, aesara.shared(array)) + logp_nojac = logpt(x, transform.backward(array)) + jacob_det = transform.jacobian_det(aesara.shared(array)) assert logpt(x).ndim == jacob_det.ndim + if vect_opt == 0: + # the original distribution is univariate + elementwiselogp = logp_nojac.sum(axis=-1) + jacob_det + else: + elementwiselogp = logp_nojac + jacob_det # Hack to get relative tolerance - a = logpt(x, array.astype(aesara.config.floatX), jacobian=False).eval() - b = logp_nojac.eval() + a = logpt(x, array).eval() + b = elementwiselogp.eval() close_to(a, b, np.abs(0.5 * (a + b) * tol)) @pytest.mark.parametrize( @@ -324,13 +328,15 @@ def check_vectortransform_elementwise_logp(self, model, vect_opt=0): (np.ones(3) * 10.0, (4, 3)), ], ) - def test_half_normal(self, sd, size): - model = self.build_model(pm.HalfNormal, {"sd": sd}, size=size, transform=tr.log) + 
@pytest.mark.xfail(reason="Distribution not refactored yet") + def test_half_normal(self, sd, shape): + model = self.build_model(pm.HalfNormal, {"sd": sd}, size=shape, transform=tr.log) self.check_transform_elementwise_logp(model) - @pytest.mark.parametrize("lam,size", [(2.5, 2), (5.0, (2, 3)), (np.ones(3), (4, 3))]) - def test_exponential(self, lam, size): - model = self.build_model(pm.Exponential, {"lam": lam}, size=size, transform=tr.log) + @pytest.mark.parametrize("lam,shape", [(2.5, 2), (5.0, (2, 3)), (np.ones(3), (4, 3))]) + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_exponential(self, lam, shape): + model = self.build_model(pm.Exponential, {"lam": lam}, size=shape, transform=tr.log) self.check_transform_elementwise_logp(model) @pytest.mark.parametrize( @@ -341,8 +347,9 @@ def test_exponential(self, lam, size): (np.ones(3), np.ones(3), (4, 3)), ], ) - def test_beta(self, a, b, size): - model = self.build_model(pm.Beta, {"alpha": a, "beta": b}, size=size, transform=tr.logodds) + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_beta(self, a, b, shape): + model = self.build_model(pm.Beta, {"alpha": a, "beta": b}, size=shape, transform=tr.logodds) self.check_transform_elementwise_logp(model) @pytest.mark.parametrize( @@ -362,7 +369,7 @@ def transform_params(rv_var): interval = tr.Interval(transform_params) model = self.build_model( - pm.Uniform, {"lower": lower, "upper": upper}, size=size, transform=interval + pm.Uniform, {"lower": lower, "upper": upper}, size=shape, transform=interval ) self.check_transform_elementwise_logp(model) @@ -370,17 +377,17 @@ def transform_params(rv_var): "mu,kappa,size", [(0.0, 1.0, 2), (-0.5, 5.5, (2, 3)), (np.zeros(3), np.ones(3), (4, 3))] ) @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_vonmises(self, mu, kappa, size): + def test_vonmises(self, mu, kappa, shape): model = self.build_model( - pm.VonMises, {"mu": mu, "kappa": kappa}, size=size, transform=tr.circular + pm.VonMises, {"mu": mu, "kappa": kappa}, size=shape, transform=tr.circular ) self.check_transform_elementwise_logp(model) @pytest.mark.parametrize( "a,size", [(np.ones(2), None), (np.ones((2, 3)) * 0.5, None), (np.ones(3), (4,))] ) - def test_dirichlet(self, a, size): - model = self.build_model(pm.Dirichlet, {"a": a}, size=size, transform=tr.stick_breaking) + def test_dirichlet(self, a, shape): + model = self.build_model(pm.Dirichlet, {"a": a}, size=shape, transform=tr.stick_breaking) self.check_vectortransform_elementwise_logp(model, vect_opt=1) def test_normal_ordered(self): @@ -406,7 +413,7 @@ def test_half_normal_ordered(self, sd, size): model = self.build_model( pm.HalfNormal, {"sd": sd}, - size=size, + size=shape, testval=testval, transform=tr.Chain([tr.log, tr.ordered]), ) @@ -418,7 +425,7 @@ def test_exponential_ordered(self, lam, size): model = self.build_model( pm.Exponential, {"lam": lam}, - size=size, + size=shape, testval=testval, transform=tr.Chain([tr.log, tr.ordered]), ) @@ -436,7 +443,7 @@ def test_beta_ordered(self, a, b, size): model = self.build_model( pm.Beta, {"alpha": a, "beta": b}, - size=size, + size=shape, testval=testval, transform=tr.Chain([tr.logodds, tr.ordered]), ) @@ -459,7 +466,7 @@ def transform_params(rv_var): model = self.build_model( pm.Uniform, {"lower": lower, "upper": upper}, - size=size, + size=shape, testval=testval, transform=tr.Chain([interval, tr.ordered]), ) @@ -472,7 +479,7 @@ def test_vonmises_ordered(self, mu, kappa, size): model = self.build_model( pm.VonMises, {"mu": mu, 
"kappa": kappa}, - size=size, + size=shape, testval=testval, transform=tr.Chain([tr.circular, tr.ordered]), ) @@ -491,7 +498,7 @@ def test_uniform_other(self, lower, upper, size, transform): model = self.build_model( pm.Uniform, {"lower": lower, "upper": upper}, - size=size, + size=shape, testval=testval, transform=transform, ) @@ -507,6 +514,6 @@ def test_uniform_other(self, lower, upper, size, transform): def test_mvnormal_ordered(self, mu, cov, size, shape): testval = np.sort(np.random.randn(*shape)) model = self.build_model( - pm.MvNormal, {"mu": mu, "cov": cov}, size=size, testval=testval, transform=tr.ordered + pm.MvNormal, {"mu": mu, "cov": cov}, size=shape, testval=testval, transform=tr.ordered ) self.check_vectortransform_elementwise_logp(model, vect_opt=1) From b7b2963378a36137fe7891512f6415b3476c1342 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sat, 13 Mar 2021 23:28:17 -0600 Subject: [PATCH 07/44] Remove remaining v3 sampling code --- pymc3/tests/test_data_container.py | 4 ++-- pymc3/tests/test_distributions_random.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index 3050893031..1dde0e7640 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -180,7 +180,7 @@ def test_shared_scalar_as_rv_input(self): v = pm.Normal("v", mu=shared_var, size=1) np.testing.assert_allclose( - logpt(v, 5.0).eval(), + logpt(v, np.r_[5.0]).eval(), -0.91893853, rtol=1e-5, ) @@ -188,7 +188,7 @@ def test_shared_scalar_as_rv_input(self): shared_var.set_value(10.0) np.testing.assert_allclose( - logpt(v, 10.0).eval(), + logpt(v, np.r_[10.0]).eval(), -0.91893853, rtol=1e-5, ) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index b5cec04986..0123a3b5c8 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -30,7 +30,7 @@ from pymc3.aesaraf import change_rv_size, floatX, intX from pymc3.distributions.dist_math import clipped_beta_rvs -from pymc3.distributions.distribution import to_tuple +from pymc3.distributions.shape_utils import to_tuple from pymc3.exceptions import ShapeError from pymc3.tests.helpers import SeededTest from pymc3.tests.test_distributions import ( From f1b4da96510e0b55cfd3d8e90f7eb8313f02afca Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard"
Date: Mon, 15 Mar 2021 01:51:05 -0500
Subject: [PATCH 08/44] Change logp_transform argument to the entire random variable

---
 pymc3/distributions/continuous.py | 8 ++++----
 pymc3/model.py                    | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py
index 1856bcba9f..b7e34d9a48 100644
--- a/pymc3/distributions/continuous.py
+++ b/pymc3/distributions/continuous.py
@@ -110,18 +110,18 @@ class BoundedContinuous(Continuous):

 @logp_transform.register(PositiveContinuous)
-def pos_cont_transform(op, inputs):
+def pos_cont_transform(op, rv_var):
     return transforms.log

 @logp_transform.register(UnitContinuous)
-def unit_cont_transform(op, inputs):
+def unit_cont_transform(op, rv_var):
     return transforms.logodds

 @logp_transform.register(BoundedContinuous)
-def bounded_cont_transform(op, inputs):
-    _, _, _, lower, upper = inputs
+def bounded_cont_transform(op, rv_var):
+    _, _, _, lower, upper = rv_var.owner.inputs
     lower = aet.as_tensor_variable(lower) if lower is not None else None
     upper = aet.as_tensor_variable(upper) if upper is not None else None

diff --git a/pymc3/model.py b/pymc3/model.py
index 9b9601578c..4bc2c7ab24 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -874,7 +874,8 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans
         self.free_RVs.append(rv_var)
         value_var = rv_var.clone()

-        transform = transform or logp_transform(rv_var.owner.op, rv_var.owner.inputs)
+        transform = transform or logp_transform(rv_var.owner.op, rv_var)
+
         if transform is not None:
             value_var.tag.transform = transform
             value_var.name = f"{rv_var.name}_{transform.name}"

From 207fc0671ed11b90a18a1f206a800c118c5eaf95 Mon Sep 17 00:00:00 2001
From: "Brandon T. Willard"
Date: Mon, 15 Mar 2021 01:54:31 -0500
Subject: [PATCH 09/44] Remove logpt transformed option

---
 pymc3/distributions/__init__.py | 25 +++++++++----------------
 pymc3/model.py                  | 16 ++++------------
 2 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py
index 241b2b54a7..a87080ac44 100644
--- a/pymc3/distributions/__init__.py
+++ b/pymc3/distributions/__init__.py
@@ -234,7 +234,6 @@ def logpt(
     rv_var: TensorVariable,
     rv_value: Optional[TensorVariable] = None,
     jacobian: Optional[bool] = True,
-    transformed: Optional[bool] = True,
     scaling: Optional[bool] = True,
     **kwargs,
 ) -> TensorVariable:
@@ -256,8 +255,6 @@ def logpt(
         when available, used.
     jacobian
         Whether or not to include the Jacobian term.
-    transformed
-        Return the transformed version of the log-likelihood graph.
     scaling
         A scaling term to apply to the generated log-likelihood graph.
@@ -310,27 +307,28 @@ def logpt(

     dist_params, replacements = sample_to_measure_vars(dist_params)

-    logp_var = _logp(rv_node.op, rv_value_var, *dist_params, **kwargs)
+    transform = getattr(rv_value_var.tag, "transform", None)

     # If any of the measure vars are transformed measure-space variables
     # (signified by having a `transform` value in their tags), then we apply
     # their transforms and add their Jacobians (when enabled)
-    if transformed:
+    if transform:
+        logp_var = _logp(rv_node.op, transform.backward(rv_value_var), *dist_params, **kwargs)
         logp_var = transform_logp(
             logp_var,
-            tuple(replacements.values()) + (rv_value_var,),
+            tuple(replacements.values()),
         )

-        transform = getattr(rv_value_var.tag, "transform", None)
-
-        if transform and jacobian:
+        if jacobian:
             transformed_jacobian = transform.jacobian_det(rv_value_var)
             if transformed_jacobian:
                 if logp_var.ndim > transformed_jacobian.ndim:
                     logp_var = logp_var.sum(axis=-1)
                 logp_var += transformed_jacobian
+    else:
+        logp_var = _logp(rv_node.op, rv_value_var, *dist_params, **kwargs)

-    (logp_var,) = clone_replace([logp_var], replace={rv_value_var: rv_value})
+    (logp_var,) = clone_replace([logp_var], replace={rv_value_var: rv_value})

     if scaling:
         logp_var *= _get_scaling(
@@ -374,7 +372,7 @@ def _logp(op, value, *dist_params, **kwargs):
     return aet.zeros_like(value)

-def logcdf(rv_var, rv_value, transformed=True, jacobian=True, **kwargs):
+def logcdf(rv_var, rv_value, jacobian=True, **kwargs):
     """Create a log-CDF graph."""
     rv_var, rv_value = rv_log_likelihood_args(rv_var)
@@ -389,11 +387,6 @@ def logcdf(rv_var, rv_value, transformed=True, jacobian=True, **kwargs):

     logp_var = _logcdf(rv_node.op, rv_value, *dist_params, **kwargs)

-    if transformed:
-        logp_var = transform_logp(
-            logp_var, tuple(replacements.values()) + (rv_value,), jacobian=jacobian
-        )
-
     return logp_var

diff --git a/pymc3/model.py b/pymc3/model.py
index 4bc2c7ab24..1cb6e29c0a 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -680,7 +680,7 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs):
         with self:
             free_RVs_logp = aet.sum(
                 [
-                    aet.sum(logpt(var, getattr(var.tag, "value_var", None), transformed=True))
+                    aet.sum(logpt(var, getattr(var.tag, "value_var", None)))
                     for var in self.free_RVs + self.potentials
                 ]
             )
@@ -698,10 +698,7 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs):
     def logpt(self):
         """Aesara scalar of log-probability of the model"""
         with self:
-            factors = [
-                logpt_sum(var, getattr(var.tag, "value_var", None), transformed=True)
-                for var in self.free_RVs
-            ]
+            factors = [logpt_sum(var, getattr(var.tag, "value_var", None)) for var in self.free_RVs]
             factors += [logpt_sum(obs) for obs in self.observed_RVs]
             factors += self.potentials
             logp_var = aet.sum([aet.sum(factor) for factor in factors])
@@ -721,9 +718,7 @@ def logp_nojact(self):
         """
         with self:
             factors = [
-                logpt_sum(
-                    var, getattr(var.tag, "value_var", None), jacobian=False, transformed=True
-                )
+                logpt_sum(var, getattr(var.tag, "value_var", None), jacobian=False)
                 for var in self.free_RVs
             ]
             factors += [logpt_sum(obs, jacobian=False) for obs in self.observed_RVs]
@@ -740,10 +735,7 @@ def varlogpt(self):
         """Aesara scalar of log-probability of the unobserved random variables
         (excluding deterministic)."""
         with self:
-            factors = [
-                logpt_sum(var, getattr(var.tag, "value_var", None), transformed=True)
-                for var in self.free_RVs
-            ]
+            factors = [logpt_sum(var, getattr(var.tag, "value_var", None)) for var in self.free_RVs]
             return aet.sum(factors)

     @property

From
08da3cc047ebb6a726535c5b1294aca277782c6d Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Mon, 15 Mar 2021 01:55:52 -0500 Subject: [PATCH 10/44] Implement transform for Dirichlet --- pymc3/distributions/multivariate.py | 14 ++++++++++++-- pymc3/tests/test_distributions.py | 12 ++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 153fdf156e..fdccd70d9a 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -40,7 +40,7 @@ import pymc3 as pm from pymc3.aesaraf import floatX, intX -from pymc3.distributions import _logp, transforms +from pymc3.distributions import _logp, logp_transform, transforms from pymc3.distributions.continuous import ChiSquared, Normal from pymc3.distributions.dist_math import bound, factln, logpow from pymc3.distributions.distribution import Continuous, Discrete @@ -393,7 +393,6 @@ class Dirichlet(Continuous): """ rv_op = dirichlet - default_transform = transforms.stick_breaking @classmethod def dist(cls, a, **kwargs): @@ -408,6 +407,17 @@ def _distr_parameters_for_repr(self): return ["a"] +@logp_transform.register(DirichletRV) +def dirichlet_transform(op, rv_var): + + if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + # If this variable is just a bunch of scalars/degenerate + # Dirichlets, we can't transform it + return None + + return transforms.stick_breaking + + @_logp.register(DirichletRV) def dirichlet_logp(op, value, a): """ diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index d685eaec5d..b59889804c 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -1962,14 +1962,22 @@ def test_dirichlet_with_batch_shapes(self, dist_shape): with pm.Model() as model: d = pm.Dirichlet("d", a=a) - pymc3_res = logpt(d, d.tag.test_value).eval() + d_value = d.tag.value_var + d_point = d.eval() + if hasattr(d_value.tag, "transform"): + d_point_trans = d_value.tag.transform.forward(d_point).eval() + else: + d_point_trans = d_point + + pymc3_res = logpt(d, d_point_trans, jacobian=False).eval() + scipy_res = np.empty_like(pymc3_res) for idx in np.ndindex(a.shape[:-1]): scipy_res[idx] = scipy.stats.dirichlet(a[idx]).logpdf(d_point[idx]) assert_almost_equal(pymc3_res, scipy_res) def test_dirichlet_shape(self): - a = at.as_tensor_variable(np.r_[1, 2]) + a = aet.as_tensor_variable(np.r_[1, 2]) dir_rv = Dirichlet.dist(a) assert dir_rv.shape.eval() == (2,) From af6ffacd08e3454caae510290bb2b595c6b88f14 Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Mon, 15 Mar 2021 01:56:43 -0500 Subject: [PATCH 11/44] Always use the value var to initially build the log-likelihood --- pymc3/distributions/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index a87080ac44..a7be6d0a7d 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -267,6 +267,9 @@ def logpt( else: rv_value = aet.as_tensor(rv_value) + if rv_value_var is None: + rv_value_var = rv_value + rv_node = rv_var.owner if not rv_node: @@ -298,9 +301,6 @@ def logpt( return aet.zeros_like(rv_var) - if rv_value_var is None: - raise NotImplementedError(f"The log-likelihood for {rv_var} is undefined") - # This case should be reached when `rv_var` is either the result of an # `Observed` or a `RandomVariable` `Op` rng, size, dtype, *dist_params = rv_node.inputs @@ -375,12 +375,14 @@ def _logp(op, value, *dist_params, **kwargs): def logcdf(rv_var, rv_value, jacobian=True, **kwargs): """Create a log-CDF graph.""" - rv_var, rv_value = rv_log_likelihood_args(rv_var) + rv_var, _ = rv_log_likelihood_args(rv_var) rv_node = rv_var.owner if not rv_node: raise TypeError() + rv_value = aet.as_tensor(rv_value) + rng, size, dtype, *dist_params = rv_node.inputs dist_params, replacements = sample_to_measure_vars(dist_params) From 121c5172ebfcf223e50c52882789902dfba75990 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Mon, 15 Mar 2021 01:58:07 -0500 Subject: [PATCH 12/44] Add an option for negative support assertions in Normal and Gamma classes --- pymc3/distributions/continuous.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index b7e34d9a48..0aa17510eb 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -466,7 +466,7 @@ class Normal(Continuous): rv_op = normal @classmethod - def dist(cls, mu=0, sigma=None, tau=None, sd=None, **kwargs): + def dist(cls, mu=0, sigma=None, tau=None, sd=None, no_assert=False, **kwargs): if sd is not None: sigma = sd tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) @@ -477,7 +477,9 @@ def dist(cls, mu=0, sigma=None, tau=None, sd=None, **kwargs): # mean = median = mode = mu = aet.as_tensor_variable(floatX(mu)) # variance = 1.0 / self.tau - assert_negative_support(sigma, "sigma", "Normal") + if not no_assert: + assert_negative_support(sigma, "sigma", "Normal") + return super().dist([mu, sigma], **kwargs) @@ -2470,7 +2472,7 @@ class Gamma(PositiveContinuous): rv_op = gamma @classmethod - def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwargs): + def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, no_assert=False, **kwargs): if sd is not None: sigma = sd @@ -2485,7 +2487,7 @@ def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwar assert_negative_support(alpha, "alpha", "Gamma") assert_negative_support(beta, "beta", "Gamma") - return super().dist([alpha, beta], **kwargs) + return super().dist([alpha, aet.inv(beta)], **kwargs) @classmethod def get_alpha_beta(cls, alpha=None, beta=None, mu=None, sigma=None): From a5b16906321cc0ce2fefc9f5bae90b8a20e98cf4 Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Mon, 15 Mar 2021 02:11:58 -0500 Subject: [PATCH 13/44] Fix Categorical logp implementation --- pymc3/distributions/discrete.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index f897f432e8..a2076283f7 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -1351,7 +1351,7 @@ def dist(cls, p, **kwargs): @_logp.register(CategoricalRV) -def categorical_logp(op, value, p, upper): +def categorical_logp(op, value, p): r""" Calculate log-probability of Categorical distribution at specified value. @@ -1362,8 +1362,9 @@ def categorical_logp(op, value, p, upper): values are desired the values must be provided in a numpy array or `TensorVariable` """ + k = aet.shape(p)[-1] + p_ = p p = p_ / aet.sum(p_, axis=-1, keepdims=True) - k = aet.shape(p_)[-1] value_clip = aet.clip(value, 0, k - 1) if p.ndim > 1: From 3d4a8b42dfa61bd059fea1759a994b7c3c19f9d6 Mon Sep 17 00:00:00 2001 From: kc611 Date: Mon, 15 Mar 2021 22:48:54 +0530 Subject: [PATCH 14/44] Refactored distributions in pymc.distributions.continuous --- pymc3/distributions/continuous.py | 625 ++++++++++++++---------------- pymc3/tests/test_distributions.py | 17 +- 2 files changed, 294 insertions(+), 348 deletions(-) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 0aa17510eb..e82e70a9c4 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -24,10 +24,22 @@ from aesara.assert_op import Assert from aesara.tensor.random.basic import ( + BetaRV, + CauchyRV, + ExponentialRV, GammaRV, + HalfCauchyRV, + HalfNormalRV, + InvGammaRV, NormalRV, UniformRV, + beta, + cauchy, + exponential, gamma, + halfcauchy, + halfnormal, + invgamma, normal, uniform, ) @@ -95,6 +107,8 @@ uniform.inplace = True gamma = copy(gamma) gamma.inplace = True +beta = copy(beta) +beta.inplace = True class PositiveContinuous(Continuous): @@ -812,73 +826,66 @@ def dist(cls, sigma=None, tau=None, sd=None, *args, **kwargs): tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) + # sigma = sd = sigma = aet.as_tensor_variable(sigma) + # tau = tau = aet.as_tensor_variable(tau) + + # mean = aet.sqrt(2 / (np.pi * tau)) + # variance = (1.0 - 2 / np.pi) / tau + assert_negative_support(tau, "tau", "HalfNormal") assert_negative_support(sigma, "sigma", "HalfNormal") - return super().dist([0.0, sigma], **kwargs) + return super().dist([sigma, tau], **kwargs) - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). + def _distr_parameters_for_repr(self): + return ["sigma"] - Returns - ------- - array - """ - # sigma = draw_values([self.sigma], point=point, size=size)[0] - # return generate_samples( - # stats.halfnorm.rvs, loc=0.0, scale=sigma, dist_shape=self.shape, size=size - # ) - def logp(self, value): - """ - Calculate log-probability of HalfNormal distribution at specified value. +@_logp.register(HalfNormalRV) +def halfnormal_logp(op, value, sigma, tau): + """ + Calculate log-probability of HalfNormal distribution at specified value. - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Returns - ------- - TensorVariable - """ - tau, sigma = get_tau_sigma(tau=None, sigma=sigma) + Returns + ------- + TensorVariable + """ + return bound( + -0.5 * tau * value ** 2 + 0.5 * aet.log(tau * 2.0 / np.pi), + value >= 0, + tau > 0, + sigma > 0, + ) - return bound( - -0.5 * tau * (value - loc) ** 2 + 0.5 * at.log(tau * 2.0 / np.pi), - value >= loc, - tau > 0, - sigma > 0, - ) - def logcdf(value, loc, sigma): - """ - Compute the log of the cumulative distribution function for HalfNormal distribution - at the specified value. +@_logcdf.register(HalfNormalRV) +def halfnormal_logcdf(op, value, sigma, tau): + """ + Compute the log of the cumulative distribution function for HalfNormal distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Returns - ------- - TensorVariable - """ - z = zvalue(value, mu=loc, sigma=sigma) - return bound( - at.log1p(-at.erfc(z / at.sqrt(2.0))), - loc <= value, - 0 < sigma, - ) + Returns + ------- + TensorVariable + """ + z = zvalue(value, mu=0, sigma=sigma) + return bound( + aet.log1p(-aet.erfc(z / aet.sqrt(2.0))), + 0 <= value, + 0 < sigma, + ) def _distr_parameters_for_repr(self): return ["sigma"] @@ -1187,8 +1194,11 @@ def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwar sigma = sd alpha, beta = cls.get_alpha_beta(alpha, beta, mu, sigma) - alpha = at.as_tensor_variable(floatX(alpha)) - beta = at.as_tensor_variable(floatX(beta)) + alpha = aet.as_tensor_variable(floatX(alpha)) + beta = aet.as_tensor_variable(floatX(beta)) + + mean = alpha / (alpha + beta) + variance = (alpha * beta) / ((alpha + beta) ** 2 * (alpha + beta + 1)) assert_negative_support(alpha, "alpha", "Beta") assert_negative_support(beta, "beta", "Beta") @@ -1211,82 +1221,69 @@ def get_alpha_beta(self, alpha=None, beta=None, mu=None, sigma=None): return alpha, beta - def random(self, point=None, size=None): - """ - Draw random values from Beta distribution. - - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). + def _distr_parameters_for_repr(self): + return ["alpha", "beta"] - Returns - ------- - array - """ - # alpha, beta = draw_values([self.alpha, self.beta], point=point, size=size) - # return generate_samples(clipped_beta_rvs, alpha, beta, dist_shape=self.shape, size=size) - def logp(value, alpha, beta): - """ - Calculate log-probability of Beta distribution at specified value. +@_logp.register(BetaRV) +def beta_logp(op, value, alpha, beta): + """ + Calculate log-probability of Beta distribution at specified value. 
- Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Returns - ------- - TensorVariable - """ + Returns + ------- + TensorVariable + """ - logval = at.log(value) - log1pval = at.log1p(-value) - logp = ( - at.switch(at.eq(alpha, 1), 0, (alpha - 1) * logval) - + at.switch(at.eq(beta, 1), 0, (beta - 1) * log1pval) - - betaln(alpha, beta) - ) + logval = aet.log(value) + log1pval = aet.log1p(-value) + logp = ( + aet.switch(aet.eq(alpha, 1), 0, (alpha - 1) * logval) + + aet.switch(aet.eq(beta, 1), 0, (beta - 1) * log1pval) + - betaln(alpha, beta) + ) - return bound(logp, value >= 0, value <= 1, alpha > 0, beta > 0) + return bound(logp, value >= 0, value <= 1, alpha > 0, beta > 0) - def logcdf(value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Beta distribution - at the specified value. - Parameters - ---------- - value: numeric - Value(s) for which log CDF is calculated. +@_logcdf.register(BetaRV) +def beta_logcdf(op, value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Beta distribution + at the specified value. - Returns - ------- - TensorVariable - """ - # incomplete_beta function can only handle scalar values (see #4342) - if np.ndim(value): - raise TypeError( - f"Beta.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." - ) + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. - return bound( - at.switch( - at.lt(value, 1), - at.log(incomplete_beta(alpha, beta, value)), - 0, - ), - 0 <= value, - 0 < alpha, - 0 < beta, + Returns + ------- + TensorVariable + """ + # incomplete_beta function can only handle scalar values (see #4342) + if np.ndim(value): + raise TypeError( + f"Beta.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." ) + return bound( + aet.switch( + aet.lt(value, 1), + aet.log(incomplete_beta(alpha, beta, value)), + 0, + ), + 0 <= value, + 0 < alpha, + 0 < beta, + ) + class Kumaraswamy(UnitContinuous): r""" @@ -1434,75 +1431,57 @@ class Exponential(PositiveContinuous): @classmethod def dist(cls, lam, *args, **kwargs): - lam = at.as_tensor_variable(floatX(lam)) + lam = aet.as_tensor_variable(floatX(lam)) # mean = 1.0 / lam - # median = mean * at.log(2) - # mode = at.zeros_like(lam) + # median = mean * aet.log(2) + # mode = aet.zeros_like(lam) # variance = lam ** -2 assert_negative_support(lam, "lam", "Exponential") return super().dist([lam], **kwargs) - def random(self, point=None, size=None): - """ - Draw random values from Exponential distribution. - - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). 
- Returns - ------- - array - """ - # lam = draw_values([self.lam], point=point, size=size)[0] - # return generate_samples( - # np.random.exponential, scale=1.0 / lam, dist_shape=self.shape, size=size - # ) +@_logp.register(ExponentialRV) +def exponential_logp(op, value, lam): + """ + Calculate log-probability of Exponential distribution at specified value. - def logp(self, value): - """ - Calculate log-probability of Exponential distribution at specified value. + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Returns + ------- + TensorVariable + """ + return bound(aet.log(lam) - lam * value, value >= 0, lam > 0) - Returns - ------- - TensorVariable - """ - return bound(at.log(lam) - lam * value, value >= 0, lam > 0) - def logcdf(value, lam): - r""" - Compute the log of cumulative distribution function for the Exponential distribution - at the specified value. +@_logcdf.register(ExponentialRV) +def exponential_logcdf(op, value, lam): + r""" + Compute the log of cumulative distribution function for the Exponential distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Returns - ------- - TensorVariable - """ - a = lam * value - return bound( - log1mexp(a), - 0 <= value, - 0 <= lam, - ) + Returns + ------- + TensorVariable + """ + a = lam * value + return bound( + log1mexp(a), + 0 <= value, + 0 <= lam, + ) class Laplace(Continuous): @@ -2247,56 +2226,56 @@ class Cauchy(Continuous): @classmethod def dist(cls, alpha, beta, *args, **kwargs): - alpha = at.as_tensor_variable(floatX(alpha)) - beta = at.as_tensor_variable(floatX(beta)) + alpha = aet.as_tensor_variable(floatX(alpha)) + beta = aet.as_tensor_variable(floatX(beta)) # median = alpha # mode = alpha - Returns - ------- - array - """ - # alpha, beta = draw_values([self.alpha, self.beta], point=point, size=size) - # return generate_samples(self._random, alpha, beta, dist_shape=self.shape, size=size) + assert_negative_support(beta, "beta", "Cauchy") + return super().dist([alpha, beta], **kwargs) - def logp(value, alpha, beta): - """ - Calculate log-probability of Cauchy distribution at specified value. - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor +@_logp.register(CauchyRV) +def cauchy_logp(op, value, alpha, beta): + """ + Calculate log-probability of Cauchy distribution at specified value. - Returns - ------- - TensorVariable - """ - return bound( - -at.log(np.pi) - at.log(beta) - at.log1p(((value - alpha) / beta) ** 2), beta > 0 - ) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. 
If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - def logcdf(value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Cauchy distribution - at the specified value. + Returns + ------- + TensorVariable + """ + return bound( + -aet.log(np.pi) - aet.log(beta) - aet.log1p(((value - alpha) / beta) ** 2), beta > 0 + ) - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. - Returns - ------- - TensorVariable - """ - return bound( - at.log(0.5 + at.arctan((value - alpha) / beta) / np.pi), - 0 < beta, - ) +@_logcdf.register(CauchyRV) +def cauchy_logcdf(op, value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Cauchy distribution + at the specified value. + + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. + + Returns + ------- + TensorVariable + """ + return bound( + aet.log(0.5 + aet.arctan((value - alpha) / beta) / np.pi), + 0 < beta, + ) class HalfCauchy(PositiveContinuous): @@ -2341,74 +2320,58 @@ class HalfCauchy(PositiveContinuous): @classmethod def dist(cls, beta, *args, **kwargs): - beta = at.as_tensor_variable(floatX(beta)) - assert_negative_support(beta, "beta", "HalfCauchy") - return super().dist([0.0, beta], **kwargs) + beta = aet.as_tensor_variable(floatX(beta)) - def _random(self, beta, size=None): - u = np.random.uniform(size=size) - return beta * np.abs(np.tan(np.pi * (u - 0.5))) + # mode = aet.as_tensor_variable(0) + # median = beta - def random(self, point=None, size=None): - """ - Draw random values from HalfCauchy distribution. + assert_negative_support(beta, "beta", "HalfCauchy") + return super().dist([beta], **kwargs) - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). - Returns - ------- - array - """ - # beta = draw_values([self.beta], point=point, size=size)[0] - # return generate_samples(self._random, beta, dist_shape=self.shape, size=size) +@_logp.register(HalfCauchyRV) +def half_cauchy_logp(op, value, beta, alpha): + """ + Calculate log-probability of HalfCauchy distribution at specified value. - def logp(self, value): - """ - Calculate log-probability of HalfCauchy distribution at specified value. + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Returns + ------- + TensorVariable + """ + return bound( + aet.log(2) - aet.log(np.pi) - aet.log(beta) - aet.log1p((value / beta) ** 2), + value >= 0, + beta > 0, + ) - Returns - ------- - TensorVariable - """ - return bound( - at.log(2) - at.log(np.pi) - at.log(beta) - at.log1p(((value - loc) / beta) ** 2), - value >= loc, - beta > 0, - ) - def logcdf(value, loc, beta): - """ - Compute the log of the cumulative distribution function for HalfCauchy distribution - at the specified value. +@_logcdf.register(HalfCauchyRV) +def half_cauchy_logcdf(op, value, beta, alpha): + """ + Compute the log of the cumulative distribution function for HalfCauchy distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Returns - ------- - TensorVariable - """ - return bound( - at.log(2 * at.arctan((value - loc) / beta) / np.pi), - loc <= value, - 0 < beta, - ) + Returns + ------- + TensorVariable + """ + return bound( + aet.log(2 * aet.arctan(value / beta) / np.pi), + 0 <= value, + 0 < beta, + ) class Gamma(PositiveContinuous): @@ -2617,8 +2580,8 @@ def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwar sigma = sd alpha, beta = cls._get_alpha_beta(alpha, beta, mu, sigma) - alpha = at.as_tensor_variable(floatX(alpha)) - beta = at.as_tensor_variable(floatX(beta)) + alpha = aet.as_tensor_variable(floatX(alpha)) + beta = aet.as_tensor_variable(floatX(beta)) # m = beta / (alpha - 1.0) # try: @@ -2628,8 +2591,8 @@ def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwar # mean = m # mode = beta / (alpha + 1.0) - # variance = at.switch( - # at.gt(alpha, 2), (beta ** 2) / ((alpha - 2) * (alpha - 1.0) ** 2), np.inf + # variance = aet.switch( + # aet.gt(alpha, 2), (beta ** 2) / ((alpha - 2) * (alpha - 1.0) ** 2), np.inf # ) assert_negative_support(alpha, "alpha", "InverseGamma") @@ -2656,75 +2619,61 @@ def _get_alpha_beta(cls, alpha, beta, mu, sigma): return alpha, beta - def random(self, point=None, size=None): - """ - Draw random values from InverseGamma distribution. + @classmethod + def _distr_parameters_for_repr(self): + return ["alpha", "beta"] - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). - Returns - ------- - array - """ - # alpha, beta = draw_values([self.alpha, self.beta], point=point, size=size) - # return generate_samples( - # stats.invgamma.rvs, a=alpha, scale=beta, dist_shape=self.shape, size=size - # ) +@_logp.register(InvGammaRV) +def inv_gamma_logp(op, value, alpha, beta): + """ + Calculate log-probability of InverseGamma distribution at specified value. - def logp(value, alpha, beta): - """ - Calculate log-probability of InverseGamma distribution at specified value. + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. 
If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Returns + ------- + TensorVariable + """ + return bound( + logpow(beta, alpha) - gammaln(alpha) - beta / value + logpow(value, -alpha - 1), + value > 0, + alpha > 0, + beta > 0, + ) - Returns - ------- - TensorVariable - """ - return bound( - logpow(beta, alpha) - gammaln(alpha) - beta / value + logpow(value, -alpha - 1), - value > 0, - alpha > 0, - beta > 0, - ) - def logcdf(value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Inverse Gamma distribution - at the specified value. +@_logcdf.register(InvGammaRV) +def inv_gamma_logcdf(op, value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Inverse Gamma distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Returns - ------- - TensorVariable - """ - # Avoid C-assertion when the gammaincc function is called with invalid values (#4340) - safe_alpha = at.switch(at.lt(alpha, 0), 0, alpha) - safe_beta = at.switch(at.lt(beta, 0), 0, beta) - safe_value = at.switch(at.lt(value, 0), 0, value) + Returns + ------- + TensorVariable + """ + # Avoid C-assertion when the gammaincc function is called with invalid values (#4340) + safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) + safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) + safe_value = aet.switch(aet.lt(value, 0), 0, value) - return bound( - at.log(at.gammaincc(safe_alpha, safe_beta / safe_value)), - 0 <= value, - 0 < alpha, - 0 < beta, - ) + return bound( + aet.log(aet.gammaincc(safe_alpha, safe_beta / safe_value)), + 0 <= value, + 0 < alpha, + 0 < beta, + ) class ChiSquared(Gamma): diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index b59889804c..c644c1da3b 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -776,7 +776,12 @@ def check_logcdf( if invalid_edge is not None: test_params = valid_params.copy() # Shallow copy should be okay test_params[invalid_param] = invalid_edge - invalid_dist = pymc3_dist.dist(**test_params) + # We need to remove `Assert`s introduced by checks like + # `assert_negative_support` and disable test values; + # otherwise, we won't be able to create the + # `RandomVariable` + with aesara.config.change_flags(compute_test_value="off"): + invalid_dist = pymc3_dist.dist(**test_params) with aesara.config.change_flags(mode=Mode("py")): assert_equal( logcdf(invalid_dist, valid_value).eval(), @@ -996,7 +1001,6 @@ def scipy_logp(value, mu, sigma, lower, upper): decimal=select_by_precision(float64=6, float32=1), ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_half_normal(self): self.check_logp( HalfNormal, @@ -1075,6 +1079,7 @@ def test_wald_logp_custom_points(self, value, mu, lam, phi, alpha, logp): decimals = select_by_precision(float64=6, 
float32=1) assert_almost_equal(model.fastlogp(pt), logp, decimal=decimals, err_msg=str(pt)) + @pytest.mark.xfail(reason="Distribution not refactored yet") def test_wald_logp(self): self.check_logp( Wald, @@ -1124,7 +1129,6 @@ def scipy_log_pdf(value, a, b): self.check_logp(Kumaraswamy, Unit, {"a": Rplus, "b": Rplus}, scipy_log_pdf) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_exponential(self): self.check_logp( Exponential, @@ -1314,7 +1318,6 @@ def test_t(self): n_samples=10, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_cauchy(self): self.check_logp( Cauchy, @@ -1329,7 +1332,6 @@ def test_cauchy(self): lambda value, alpha, beta: sp.cauchy.logcdf(value, alpha, beta), ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_half_cauchy(self): self.check_logp( HalfCauchy, @@ -1378,11 +1380,6 @@ def test_gamma_logcdf(self): skip_paramdomain_outside_edge_test=True, ) - @pytest.mark.xfail( - condition=(aesara.config.floatX == "float32"), - reason="Fails on float32 due to numerical issues", - ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_inverse_gamma_logp(self): self.check_logp( InverseGamma, From 63c2688b2860f316d456fb05619ec068a72445d3 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Mon, 15 Mar 2021 23:12:35 -0500 Subject: [PATCH 15/44] Simplify the new Distribution interface and convert a few more --- pymc3/distributions/continuous.py | 723 +++++++++++++--------------- pymc3/distributions/discrete.py | 328 +++++-------- pymc3/distributions/distribution.py | 71 ++- pymc3/distributions/multivariate.py | 296 +++++------- pymc3/tests/test_distributions.py | 106 ++-- 5 files changed, 684 insertions(+), 840 deletions(-) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index e82e70a9c4..aa9d45c044 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -17,22 +17,12 @@ A collection of common probability distributions for stochastic nodes in PyMC. """ -from copy import copy import aesara.tensor as at import numpy as np from aesara.assert_op import Assert from aesara.tensor.random.basic import ( - BetaRV, - CauchyRV, - ExponentialRV, - GammaRV, - HalfCauchyRV, - HalfNormalRV, - InvGammaRV, - NormalRV, - UniformRV, beta, cauchy, exponential, @@ -47,7 +37,7 @@ from scipy.interpolate import InterpolatedUnivariateSpline from pymc3.aesaraf import floatX -from pymc3.distributions import _logcdf, _logp, logp_transform, transforms +from pymc3.distributions import logp_transform, transforms from pymc3.distributions.dist_math import ( SplineWrapper, betaln, @@ -100,16 +90,6 @@ "AsymmetricLaplace", ] -# FIXME: These are temporary hacks -normal = copy(normal) -normal.inplace = True -uniform = copy(uniform) -uniform.inplace = True -gamma = copy(gamma) -gamma.inplace = True -beta = copy(beta) -beta.inplace = True - class PositiveContinuous(Continuous): """Base class for positive continuous distributions""" @@ -251,52 +231,45 @@ def dist(cls, lower=0, upper=1, **kwargs): # median = self.mean return super().dist([lower, upper], **kwargs) + def logp(value, lower, upper): + """ + Calculate log-probability of Uniform distribution at specified value. -BoundedContinuous.register(UniformRV) - - -@_logp.register(UniformRV) -def uniform_logp(op, value, lower, upper): - """ - Calculate log-probability of Uniform distribution at specified value. - - Parameters - ---------- - value: numeric - Value for which log-probability is calculated. 
- - Returns - ------- - TensorVariable - """ - return bound(-aet.log(upper - lower), value >= lower, value <= upper) + Parameters + ---------- + value: numeric + Value for which log-probability is calculated. + Returns + ------- + TensorVariable + """ + return bound(-aet.log(upper - lower), value >= lower, value <= upper) -@_logcdf.register(UniformRV) -def uniform_logcdf(op, value, lower, upper): - """ - Compute the log of the cumulative distribution function for Uniform distribution - at the specified value. + def logcdf(value, lower, upper): + """ + Compute the log of the cumulative distribution function for Uniform distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or `TensorVariable` - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. + Parameters + ---------- + value: numeric or np.ndarray or `TensorVariable` + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. - Returns - ------- - TensorVariable - """ - return aet.switch( - aet.lt(value, lower) | aet.lt(upper, lower), - -np.inf, - aet.switch( - aet.lt(value, upper), - aet.log(value - lower) - aet.log(upper - lower), - 0, - ), - ) + Returns + ------- + TensorVariable + """ + return aet.switch( + aet.lt(value, lower) | aet.lt(upper, lower), + -np.inf, + aet.switch( + aet.lt(value, upper), + aet.log(value - lower) - aet.log(upper - lower), + 0, + ), + ) class Flat(Continuous): @@ -496,47 +469,43 @@ def dist(cls, mu=0, sigma=None, tau=None, sd=None, no_assert=False, **kwargs): return super().dist([mu, sigma], **kwargs) + def logp(value, mu, sigma): + """ + Calculate log-probability of Normal distribution at specified value. -@_logp.register(NormalRV) -def normal_logp(op, value, mu, sigma): - """ - Calculate log-probability of Normal distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. - - Returns - ------- - TensorVariable - """ - tau, sigma = get_tau_sigma(tau=None, sigma=sigma) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. - return bound((-tau * (value - mu) ** 2 + aet.log(tau / np.pi / 2.0)) / 2.0, sigma > 0) + Returns + ------- + TensorVariable + """ + tau, sigma = get_tau_sigma(tau=None, sigma=sigma) + return bound((-tau * (value - mu) ** 2 + aet.log(tau / np.pi / 2.0)) / 2.0, sigma > 0) -@_logcdf.register(NormalRV) -def normal_logcdf(op, value, mu, sigma): - """ - Compute the log of the cumulative distribution function for Normal distribution - at the specified value. + def logcdf(value, mu, sigma): + """ + Compute the log of the cumulative distribution function for Normal distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or `TensorVariable` - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. + Parameters + ---------- + value: numeric or np.ndarray or `TensorVariable` + Value(s) for which log CDF is calculated. 
If the log CDF for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. - Returns - ------- - TensorVariable - """ - return bound( - normal_lcdf(mu, sigma, value), - 0 < sigma, - ) + Returns + ------- + TensorVariable + """ + return bound( + normal_lcdf(mu, sigma, value), + 0 < sigma, + ) class TruncatedNormal(BoundedContinuous): @@ -837,55 +806,51 @@ def dist(cls, sigma=None, tau=None, sd=None, *args, **kwargs): return super().dist([sigma, tau], **kwargs) - def _distr_parameters_for_repr(self): - return ["sigma"] - - -@_logp.register(HalfNormalRV) -def halfnormal_logp(op, value, sigma, tau): - """ - Calculate log-probability of HalfNormal distribution at specified value. + def logp(value, sigma, tau): + """ + Calculate log-probability of HalfNormal distribution at specified value. - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Returns - ------- - TensorVariable - """ - return bound( - -0.5 * tau * value ** 2 + 0.5 * aet.log(tau * 2.0 / np.pi), - value >= 0, - tau > 0, - sigma > 0, - ) + Returns + ------- + TensorVariable + """ + return bound( + -0.5 * tau * value ** 2 + 0.5 * aet.log(tau * 2.0 / np.pi), + value >= 0, + tau > 0, + sigma > 0, + ) + def logcdf(value, sigma, tau): + """ + Compute the log of the cumulative distribution function for HalfNormal distribution + at the specified value. -@_logcdf.register(HalfNormalRV) -def halfnormal_logcdf(op, value, sigma, tau): - """ - Compute the log of the cumulative distribution function for HalfNormal distribution - at the specified value. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Returns + ------- + TensorVariable + """ + z = zvalue(value, mu=0, sigma=sigma) + return bound( + aet.log1p(-aet.erfc(z / aet.sqrt(2.0))), + 0 <= value, + 0 < sigma, + ) - Returns - ------- - TensorVariable - """ - z = zvalue(value, mu=0, sigma=sigma) - return bound( - aet.log1p(-aet.erfc(z / aet.sqrt(2.0))), - 0 <= value, - 0 < sigma, - ) + def _distr_parameters_for_repr(self): + return ["sigma"] def _distr_parameters_for_repr(self): return ["sigma"] @@ -1224,66 +1189,62 @@ def get_alpha_beta(self, alpha=None, beta=None, mu=None, sigma=None): def _distr_parameters_for_repr(self): return ["alpha", "beta"] + def logp(value, alpha, beta): + """ + Calculate log-probability of Beta distribution at specified value. -@_logp.register(BetaRV) -def beta_logp(op, value, alpha, beta): - """ - Calculate log-probability of Beta distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Returns - ------- - TensorVariable - """ + Returns + ------- + TensorVariable + """ - logval = aet.log(value) - log1pval = aet.log1p(-value) - logp = ( - aet.switch(aet.eq(alpha, 1), 0, (alpha - 1) * logval) - + aet.switch(aet.eq(beta, 1), 0, (beta - 1) * log1pval) - - betaln(alpha, beta) - ) + logval = aet.log(value) + log1pval = aet.log1p(-value) + logp = ( + aet.switch(aet.eq(alpha, 1), 0, (alpha - 1) * logval) + + aet.switch(aet.eq(beta, 1), 0, (beta - 1) * log1pval) + - betaln(alpha, beta) + ) - return bound(logp, value >= 0, value <= 1, alpha > 0, beta > 0) + return bound(logp, value >= 0, value <= 1, alpha > 0, beta > 0) + def logcdf(value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Beta distribution + at the specified value. -@_logcdf.register(BetaRV) -def beta_logcdf(op, value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Beta distribution - at the specified value. + Parameters + ---------- + value: numeric + Value(s) for which log CDF is calculated. - Parameters - ---------- - value: numeric - Value(s) for which log CDF is calculated. + Returns + ------- + TensorVariable + """ + # incomplete_beta function can only handle scalar values (see #4342) + if np.ndim(value): + raise TypeError( + f"Beta.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." + ) - Returns - ------- - TensorVariable - """ - # incomplete_beta function can only handle scalar values (see #4342) - if np.ndim(value): - raise TypeError( - f"Beta.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." + return bound( + aet.switch( + aet.lt(value, 1), + aet.log(incomplete_beta(alpha, beta, value)), + 0, + ), + 0 <= value, + 0 < alpha, + 0 < beta, ) - return bound( - aet.switch( - aet.lt(value, 1), - aet.log(incomplete_beta(alpha, beta, value)), - 0, - ), - 0 <= value, - 0 < alpha, - 0 < beta, - ) - class Kumaraswamy(UnitContinuous): r""" @@ -1441,47 +1402,43 @@ def dist(cls, lam, *args, **kwargs): assert_negative_support(lam, "lam", "Exponential") return super().dist([lam], **kwargs) + def logp(value, lam): + """ + Calculate log-probability of Exponential distribution at specified value. -@_logp.register(ExponentialRV) -def exponential_logp(op, value, lam): - """ - Calculate log-probability of Exponential distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor - - Returns - ------- - TensorVariable - """ - return bound(aet.log(lam) - lam * value, value >= 0, lam > 0) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. 
If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor + Returns + ------- + TensorVariable + """ + return bound(aet.log(lam) - lam * value, value >= 0, lam > 0) -@_logcdf.register(ExponentialRV) -def exponential_logcdf(op, value, lam): - r""" - Compute the log of cumulative distribution function for the Exponential distribution - at the specified value. + def logcdf(value, lam): + r""" + Compute the log of cumulative distribution function for the Exponential distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Returns - ------- - TensorVariable - """ - a = lam * value - return bound( - log1mexp(a), - 0 <= value, - 0 <= lam, - ) + Returns + ------- + TensorVariable + """ + a = lam * value + return bound( + log1mexp(a), + 0 <= value, + 0 <= lam, + ) class Laplace(Continuous): @@ -2235,47 +2192,43 @@ def dist(cls, alpha, beta, *args, **kwargs): assert_negative_support(beta, "beta", "Cauchy") return super().dist([alpha, beta], **kwargs) + def logp(value, alpha, beta): + """ + Calculate log-probability of Cauchy distribution at specified value. -@_logp.register(CauchyRV) -def cauchy_logp(op, value, alpha, beta): - """ - Calculate log-probability of Cauchy distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor - - Returns - ------- - TensorVariable - """ - return bound( - -aet.log(np.pi) - aet.log(beta) - aet.log1p(((value - alpha) / beta) ** 2), beta > 0 - ) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor + Returns + ------- + TensorVariable + """ + return bound( + -aet.log(np.pi) - aet.log(beta) - aet.log1p(((value - alpha) / beta) ** 2), beta > 0 + ) -@_logcdf.register(CauchyRV) -def cauchy_logcdf(op, value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Cauchy distribution - at the specified value. + def logcdf(value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Cauchy distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. 
- Returns - ------- - TensorVariable - """ - return bound( - aet.log(0.5 + aet.arctan((value - alpha) / beta) / np.pi), - 0 < beta, - ) + Returns + ------- + TensorVariable + """ + return bound( + aet.log(0.5 + aet.arctan((value - alpha) / beta) / np.pi), + 0 < beta, + ) class HalfCauchy(PositiveContinuous): @@ -2328,50 +2281,46 @@ def dist(cls, beta, *args, **kwargs): assert_negative_support(beta, "beta", "HalfCauchy") return super().dist([beta], **kwargs) + def logp(value, beta, alpha): + """ + Calculate log-probability of HalfCauchy distribution at specified value. -@_logp.register(HalfCauchyRV) -def half_cauchy_logp(op, value, beta, alpha): - """ - Calculate log-probability of HalfCauchy distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor - - Returns - ------- - TensorVariable - """ - return bound( - aet.log(2) - aet.log(np.pi) - aet.log(beta) - aet.log1p((value / beta) ** 2), - value >= 0, - beta > 0, - ) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor + Returns + ------- + TensorVariable + """ + return bound( + aet.log(2) - aet.log(np.pi) - aet.log(beta) - aet.log1p((value / beta) ** 2), + value >= 0, + beta > 0, + ) -@_logcdf.register(HalfCauchyRV) -def half_cauchy_logcdf(op, value, beta, alpha): - """ - Compute the log of the cumulative distribution function for HalfCauchy distribution - at the specified value. + def logcdf(value, beta, alpha): + """ + Compute the log of the cumulative distribution function for HalfCauchy distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Returns - ------- - TensorVariable - """ - return bound( - aet.log(2 * aet.arctan(value / beta) / np.pi), - 0 <= value, - 0 < beta, - ) + Returns + ------- + TensorVariable + """ + return bound( + aet.log(2 * aet.arctan(value / beta) / np.pi), + 0 <= value, + 0 < beta, + ) class Gamma(PositiveContinuous): @@ -2471,57 +2420,53 @@ def get_alpha_beta(cls, alpha=None, beta=None, mu=None, sigma=None): def _distr_parameters_for_repr(self): return ["alpha", "beta"] + def logp(value, alpha, beta): + """ + Calculate log-probability of Gamma distribution at specified value. -@_logp.register(GammaRV) -def gamma_logp(op, value, alpha, beta): - """ - Calculate log-probability of Gamma distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. - - Returns - ------- - TensorVariable - """ - return bound( - -gammaln(alpha) + logpow(beta, alpha) - beta * value + logpow(value, alpha - 1), - value >= 0, - alpha > 0, - beta > 0, - ) + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. 
If the log probabilities for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. + Returns + ------- + TensorVariable + """ + return bound( + -gammaln(alpha) + logpow(beta, alpha) - beta * value + logpow(value, alpha - 1), + value >= 0, + alpha > 0, + beta > 0, + ) -@_logcdf.register(GammaRV) -def gamma_logcdf(op, value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Gamma distribution - at the specified value. + def logcdf(value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Gamma distribution + at the specified value. - Parameters - ---------- - value: numeric or np.ndarray or `TensorVariable` - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or `TensorVariable`. + Parameters + ---------- + value: numeric or np.ndarray or `TensorVariable` + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or `TensorVariable`. - Returns - ------- - TensorVariable - """ - # Avoid C-assertion when the gammainc function is called with invalid values (#4340) - safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) - safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) - safe_value = aet.switch(aet.lt(value, 0), 0, value) + Returns + ------- + TensorVariable + """ + # Avoid C-assertion when the gammainc function is called with invalid values (#4340) + safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) + safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) + safe_value = aet.switch(aet.lt(value, 0), 0, value) - return bound( - aet.log(aet.gammainc(safe_alpha, safe_beta * safe_value)), - 0 <= value, - 0 < alpha, - 0 < beta, - ) + return bound( + aet.log(aet.gammainc(safe_alpha, safe_beta * safe_value)), + 0 <= value, + 0 < alpha, + 0 < beta, + ) class InverseGamma(PositiveContinuous): @@ -2623,57 +2568,53 @@ def _get_alpha_beta(cls, alpha, beta, mu, sigma): def _distr_parameters_for_repr(self): return ["alpha", "beta"] + def logp(value, alpha, beta): + """ + Calculate log-probability of InverseGamma distribution at specified value. -@_logp.register(InvGammaRV) -def inv_gamma_logp(op, value, alpha, beta): - """ - Calculate log-probability of InverseGamma distribution at specified value. - - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Returns - ------- - TensorVariable - """ - return bound( - logpow(beta, alpha) - gammaln(alpha) - beta / value + logpow(value, -alpha - 1), - value > 0, - alpha > 0, - beta > 0, - ) + Returns + ------- + TensorVariable + """ + return bound( + logpow(beta, alpha) - gammaln(alpha) - beta / value + logpow(value, -alpha - 1), + value > 0, + alpha > 0, + beta > 0, + ) + def logcdf(value, alpha, beta): + """ + Compute the log of the cumulative distribution function for Inverse Gamma distribution + at the specified value. -@_logcdf.register(InvGammaRV) -def inv_gamma_logcdf(op, value, alpha, beta): - """ - Compute the log of the cumulative distribution function for Inverse Gamma distribution - at the specified value. 
+ Parameters + ---------- + value: numeric or np.ndarray or aesara.tensor + Value(s) for which log CDF is calculated. If the log CDF for multiple + values are desired the values must be provided in a numpy array or aesara tensor. - Parameters - ---------- - value: numeric or np.ndarray or aesara.tensor - Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or aesara tensor. + Returns + ------- + TensorVariable + """ + # Avoid C-assertion when the gammaincc function is called with invalid values (#4340) + safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) + safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) + safe_value = aet.switch(aet.lt(value, 0), 0, value) - Returns - ------- - TensorVariable - """ - # Avoid C-assertion when the gammaincc function is called with invalid values (#4340) - safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) - safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) - safe_value = aet.switch(aet.lt(value, 0), 0, value) - - return bound( - aet.log(aet.gammaincc(safe_alpha, safe_beta / safe_value)), - 0 <= value, - 0 < alpha, - 0 < beta, - ) + return bound( + aet.log(aet.gammaincc(safe_alpha, safe_beta / safe_value)), + 0 <= value, + 0 < alpha, + 0 < beta, + ) class ChiSquared(Gamma): diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index a2076283f7..af1234307a 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -13,16 +13,13 @@ # limitations under the License. import warnings -from copy import copy - import aesara.tensor as aet import numpy as np -from aesara.tensor.random.basic import BinomialRV, CategoricalRV, binomial, categorical +from aesara.tensor.random.basic import bernoulli, binomial, categorical, nbinom, poisson from scipy import stats from pymc3.aesaraf import floatX, intX, take_along_axis -from pymc3.distributions import _logcdf, _logp from pymc3.distributions.dist_math import ( betaln, binomln, @@ -35,7 +32,7 @@ normal_lcdf, ) from pymc3.distributions.distribution import Discrete -from pymc3.math import log1mexp, log1pexp, logaddexp, logit, logsumexp, sigmoid, tround +from pymc3.math import log1mexp, logaddexp, logsumexp, sigmoid, tround __all__ = [ "Binomial", @@ -56,12 +53,6 @@ "OrderedLogistic", ] -# FIXME: These are temporary hacks -categorical = copy(categorical) -categorical.inplace = True -binomial = copy(binomial) -binomial.inplace = True - class Binomial(Discrete): R""" @@ -114,66 +105,62 @@ def dist(cls, n, p, *args, **kwargs): # mode = aet.cast(tround(n * p), self.dtype) return super().dist([n, p], **kwargs) + def logp(value, n, p): + r""" + Calculate log-probability of Binomial distribution at specified value. -@_logp.register(BinomialRV) -def binomial_logp(op, value, n, p): - r""" - Calculate log-probability of Binomial distribution at specified value. + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or aesara tensor - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or aesara tensor + Returns + ------- + TensorVariable + """ + return bound( + binomln(n, value) + logpow(p, value) + logpow(1 - p, n - value), + 0 <= value, + value <= n, + 0 <= p, + p <= 1, + ) - Returns - ------- - TensorVariable - """ - return bound( - binomln(n, value) + logpow(p, value) + logpow(1 - p, n - value), - 0 <= value, - value <= n, - 0 <= p, - p <= 1, - ) + def logcdf(value, n, p): + """ + Compute the log of the cumulative distribution function for Binomial distribution + at the specified value. + Parameters + ---------- + value: numeric + Value for which log CDF is calculated. -@_logcdf.register(BinomialRV) -def binomial_logcdf(op, value, n, p): - """ - Compute the log of the cumulative distribution function for Binomial distribution - at the specified value. + Returns + ------- + TensorVariable + """ + # incomplete_beta function can only handle scalar values (see #4342) + if np.ndim(value): + raise TypeError( + f"Binomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." + ) - Parameters - ---------- - value: numeric - Value for which log CDF is calculated. + value = aet.floor(value) - Returns - ------- - TensorVariable - """ - # incomplete_beta function can only handle scalar values (see #4342) - if np.ndim(value): - raise TypeError( - f"Binomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." + return bound( + aet.switch( + aet.lt(value, n), + aet.log(incomplete_beta(n - value, value + 1, 1 - p)), + 0, + ), + 0 <= value, + 0 < n, + 0 <= p, + p <= 1, ) - value = aet.floor(value) - - return bound( - aet.switch( - aet.lt(value, n), - aet.log(incomplete_beta(n - value, value + 1, 1 - p)), - 0, - ), - 0 <= value, - 0 < n, - 0 <= p, - p <= 1, - ) - class BetaBinomial(Discrete): R""" @@ -281,7 +268,6 @@ def random(self, point=None, size=None): # return generate_samples( # self._random, alpha=alpha, beta=beta, n=n, dist_shape=self.shape, size=size # ) - pass def logp(self, value): r""" @@ -385,41 +371,11 @@ class Bernoulli(Discrete): """ rv_op = bernoulli - def __init__(self, p=None, logit_p=None, *args, **kwargs): - super().__init__(*args, **kwargs) - if sum(int(var is None) for var in [p, logit_p]) != 1: - raise ValueError("Specify one of p and logit_p") - if p is not None: - self._is_logit = False - self.p = p = aet.as_tensor_variable(floatX(p)) - self._logit_p = logit(p) - else: - self._is_logit = True - self.p = aet.nnet.sigmoid(floatX(logit_p)) - self._logit_p = aet.as_tensor_variable(logit_p) - - self.mode = aet.cast(tround(self.p), "int8") - - def random(self, point=None, size=None): - r""" - Draw random values from Bernoulli distribution. - - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). 
- - Returns - ------- - array - """ - # p = draw_values([self.p], point=point, size=size)[0] - # return generate_samples(stats.bernoulli.rvs, p, dist_shape=self.shape, size=size) - pass + @classmethod + def dist(cls, p=None, logit_p=None, *args, **kwargs): + p = aet.as_tensor_variable(floatX(p)) + # mode = aet.cast(tround(p), "int8") + return super().dist([p], **kwargs) def logp(value, p): r""" @@ -436,11 +392,11 @@ def logp(value, p): TensorVariable """ # if self._is_logit: - # lp = at.switch(value, self._logit_p, -self._logit_p) + # lp = aet.switch(value, self._logit_p, -self._logit_p) # return -log1pexp(-lp) # else: return bound( - at.switch(value, at.log(p), at.log(1 - p)), + aet.switch(value, aet.log(p), aet.log(1 - p)), value >= 0, value <= 1, p >= 0, @@ -560,7 +516,6 @@ def random(self, point=None, size=None): """ # q, beta = draw_values([self.q, self.beta], point=point, size=size) # return generate_samples(self._random, q, beta, dist_shape=self.shape, size=size) - pass def logp(self, value): r""" @@ -659,31 +614,11 @@ class Poisson(Discrete): """ rv_op = poisson - def __init__(self, mu, *args, **kwargs): - super().__init__(*args, **kwargs) - self.mu = mu = aet.as_tensor_variable(floatX(mu)) - self.mode = intX(aet.floor(mu)) - - def random(self, point=None, size=None): - r""" - Draw random values from Poisson distribution. - - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). - - Returns - ------- - array - """ - # mu = draw_values([self.mu], point=point, size=size)[0] - # return generate_samples(stats.poisson.rvs, mu, dist_shape=self.shape, size=size) - pass + @classmethod + def dist(cls, mu, *args, **kwargs): + mu = aet.as_tensor_variable(floatX(mu)) + # mode = intX(aet.floor(mu)) + return super().dist([mu], *args, **kwargs) def logp(value, mu): r""" @@ -718,7 +653,7 @@ def logcdf(value, mu): ------- TensorVariable """ - value = at.floor(value) + value = aet.floor(value) # Avoid C-assertion when the gammaincc function is called with invalid values (#4340) safe_mu = at.switch(at.lt(mu, 0), 0, mu) safe_value = at.switch(at.lt(value, 0), 0, value) @@ -796,25 +731,27 @@ def NegBinom(a, m, x): @classmethod def dist(cls, mu=None, alpha=None, p=None, n=None, *args, **kwargs): - n, p = cls.get_mu_alpha(mu, alpha, p, n) - n = at.as_tensor_variable(floatX(n)) - p = at.as_tensor_variable(floatX(p)) - return super().dist([n, p], *args, **kwargs) + mu, alpha = cls.get_mu_alpha(mu, alpha, p, n) + mu = aet.as_tensor_variable(floatX(mu)) + alpha = aet.as_tensor_variable(floatX(alpha)) + # mode = intX(aet.floor(mu)) + return super().dist([mu, alpha], *args, **kwargs) @classmethod def get_mu_alpha(cls, mu=None, alpha=None, p=None, n=None): - if n is None: - if alpha is not None: - n = at.as_tensor_variable(floatX(alpha)) + if alpha is None: + if n is not None: + n = aet.as_tensor_variable(intX(n)) + alpha = n else: raise ValueError("Incompatible parametrization. Must specify either alpha or n.") elif alpha is not None: raise ValueError("Incompatible parametrization. Can't specify both alpha and n.") - if p is None: - if mu is not None: - mu = at.as_tensor_variable(floatX(mu)) - p = n / (mu + n) + if mu is None: + if p is not None: + p = aet.as_tensor_variable(floatX(p)) + mu = alpha * (1 - p) / p else: raise ValueError("Incompatible parametrization. 
Must specify either mu or p.") elif mu is not None: @@ -822,42 +759,7 @@ def get_mu_alpha(cls, mu=None, alpha=None, p=None, n=None): return mu, alpha - def random(self, point=None, size=None): - r""" - Draw random values from NegativeBinomial distribution. - - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). - - Returns - ------- - array - """ - # mu, alpha = draw_values([self.mu, self.alpha], point=point, size=size) - # g = generate_samples(self._random, mu=mu, alpha=alpha, dist_shape=self.shape, size=size) - # g[g == 0] = np.finfo(float).eps # Just in case - # return np.asarray(stats.poisson.rvs(g)).reshape(g.shape) - pass - - def _random(self, mu, alpha, size): - r"""Wrapper around stats.gamma.rvs that converts NegativeBinomial's - parametrization to scipy.gamma. All parameter arrays should have - been broadcasted properly by generate_samples at this point and size is - the scipy.rvs representation. - """ - return stats.gamma.rvs( - a=alpha, - scale=mu / alpha, - size=size, - ) - - def logp(value, n, p): + def logp(value, mu, alpha): r""" Calculate log-probability of NegativeBinomial distribution at specified value. @@ -871,8 +773,6 @@ def logp(value, n, p): ------- TensorVariable """ - alpha = n - mu = alpha * (1 - p) / p negbinom = bound( binomln(value + alpha - 1, value) + logpow(mu / (mu + alpha), value) @@ -883,9 +783,9 @@ def logp(value, n, p): ) # Return Poisson when alpha gets very large. - return at.switch(at.gt(alpha, 1e10), Poisson.logp(value, mu), negbinom) + return aet.switch(aet.gt(alpha, 1e10), Poisson.dist(mu).logp(value), negbinom) - def logcdf(value, n, p): + def logcdf(value, mu, alpha): """ Compute the log of the cumulative distribution function for NegativeBinomial distribution at the specified value. @@ -905,6 +805,9 @@ def logcdf(value, n, p): f"NegativeBinomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." ) + # TODO: avoid `p` recomputation if distribution was defined in terms of `p` + p = alpha / (mu + alpha) + return bound( at.log(incomplete_beta(n, at.floor(value) + 1, p)), 0 <= value, @@ -976,7 +879,6 @@ def random(self, point=None, size=None): """ # p = draw_values([self.p], point=point, size=size)[0] # return generate_samples(np.random.geometric, p, dist_shape=self.shape, size=size) - pass def logp(self, value): r""" @@ -1094,7 +996,6 @@ def random(self, point=None, size=None): # N, k, n = draw_values([self.N, self.k, self.n], point=point, size=size) # return generate_samples(self._random, N, k, n, dist_shape=self.shape, size=size) - pass def _random(self, M, n, N, size=None): r"""Wrapper around scipy stat's hypergeom.rvs""" @@ -1248,7 +1149,6 @@ def random(self, point=None, size=None): """ # lower, upper = draw_values([self.lower, self.upper], point=point, size=size) # return generate_samples(self._random, lower, upper, dist_shape=self.shape, size=size) - pass def logp(self, value): r""" @@ -1349,42 +1249,40 @@ def dist(cls, p, **kwargs): return super().dist([p], **kwargs) + def logp(value, p): + r""" + Calculate log-probability of Categorical distribution at specified value. -@_logp.register(CategoricalRV) -def categorical_logp(op, value, p): - r""" - Calculate log-probability of Categorical distribution at specified value. 
- - Parameters - ---------- - value: numeric - Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or `TensorVariable` + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or `TensorVariable` - """ - k = aet.shape(p)[-1] - p_ = p - p = p_ / aet.sum(p_, axis=-1, keepdims=True) - value_clip = aet.clip(value, 0, k - 1) - - if p.ndim > 1: - if p.ndim > value_clip.ndim: - value_clip = aet.shape_padleft(value_clip, p_.ndim - value_clip.ndim) - elif p.ndim < value_clip.ndim: - p = aet.shape_padleft(p, value_clip.ndim - p_.ndim) - pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1)) - a = aet.log( - take_along_axis( - p.dimshuffle(pattern), - value_clip, + """ + k = aet.shape(p)[-1] + p_ = p + p = p_ / aet.sum(p_, axis=-1, keepdims=True) + value_clip = aet.clip(value, 0, k - 1) + + if p.ndim > 1: + if p.ndim > value_clip.ndim: + value_clip = aet.shape_padleft(value_clip, p_.ndim - value_clip.ndim) + elif p.ndim < value_clip.ndim: + p = aet.shape_padleft(p, value_clip.ndim - p_.ndim) + pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1)) + a = aet.log( + take_along_axis( + p.dimshuffle(pattern), + value_clip, + ) ) - ) - else: - a = aet.log(p[value_clip]) + else: + a = aet.log(p[value_clip]) - return bound( - a, value >= 0, value <= (k - 1), aet.all(p_ >= 0, axis=-1), aet.all(p <= 1, axis=-1) - ) + return bound( + a, value >= 0, value <= (k - 1), aet.all(p_ >= 0, axis=-1), aet.all(p <= 1, axis=-1) + ) class Constant(Discrete): @@ -1429,7 +1327,6 @@ def random(self, point=None, size=None): # return np.full(size, fill_value=c, dtype=dtype) # # return generate_samples(_random, c=c, dist_shape=self.shape, size=size).astype(dtype) - pass def logp(self, value): r""" @@ -1531,7 +1428,6 @@ def random(self, point=None, size=None): # g = generate_samples(stats.poisson.rvs, theta, dist_shape=self.shape, size=size) # g, psi = broadcast_distribution_samples([g, psi], size=size) # return g * (np.random.random(g.shape) < psi) - pass def logp(self, value): r""" @@ -1664,7 +1560,6 @@ def random(self, point=None, size=None): # g = generate_samples(stats.binom.rvs, n, p, dist_shape=self.shape, size=size) # g, psi = broadcast_distribution_samples([g, psi], size=size) # return g * (np.random.random(g.shape) < psi) - pass def logp(self, value): r""" @@ -1821,7 +1716,6 @@ def random(self, point=None, size=None): # g[g == 0] = np.finfo(float).eps # Just in case # g, psi = broadcast_distribution_samples([g, psi], size=size) # return stats.poisson.rvs(g) * (np.random.random(g.shape) < psi) - pass def _random(self, mu, alpha, size): r"""Wrapper around stats.gamma.rvs that converts NegativeBinomial's diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index acf00ef970..4706a92eec 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -18,14 +18,15 @@ import types import warnings -from abc import ABC +from abc import ABCMeta +from copy import copy from typing import TYPE_CHECKING import dill from aesara.tensor.random.op import RandomVariable -from pymc3.distributions import _logcdf, _logp +from pymc3.distributions import _logcdf, _logp, logp_transform if TYPE_CHECKING: from typing import Optional, Callable @@ -61,9 +62,73 @@ class _Unpickling: pass -class Distribution(ABC): +class DistributionMeta(ABCMeta): + def 
__new__(cls, name, bases, clsdict): + + new_cls = super().__new__(cls, name, bases, clsdict) + + # Forcefully deprecate old v3 `Distribution`s + if "random" in clsdict: + + def _random(*args, **kwargs): + warnings.warn( + "The old `Distribution.random` interface is deprecated.", + DeprecationWarning, + stacklevel=2, + ) + return clsdict["random"](*args, **kwargs) + + clsdict["random"] = _random + + rv_op = clsdict.setdefault("rv_op", None) + rv_type = None + + if isinstance(rv_op, RandomVariable): + if not rv_op.inplace: + # TODO: This is a temporary work-around. + # Remove this once we know what we want regarding RNG states + # and their propagation. + rv_op = copy(rv_op) + rv_op.inplace = True + clsdict["rv_op"] = rv_op + + rv_type = type(rv_op) + + if rv_type is not None: + # Create dispatch functions + + class_logp = clsdict.get("logp") + if class_logp: + + @_logp.register(rv_type) + def logp(op, value, *dist_params, **kwargs): + return class_logp(value, *dist_params, **kwargs) + + class_logcdf = clsdict.get("logcdf") + if class_logcdf: + + @_logcdf.register(rv_type) + def logcdf(op, value, *dist_params, **kwargs): + return class_logcdf(value, *dist_params, **kwargs) + + class_transform = clsdict.get("transform") + if class_transform: + + @logp_transform.register(rv_type) + def transform(op, *args, **kwargs): + return class_transform(*args, **kwargs) + + # Register the Aesara `RandomVariable` type as a subclass of this + # `Distribution` type. + new_cls.register(rv_type) + + return new_cls + + +class Distribution(metaclass=DistributionMeta): """Statistical distribution""" + rv_class = None rv_op = None def __new__(cls, name, *args, **kwargs): diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index fdccd70d9a..60f055b66f 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -17,8 +17,6 @@ import warnings -from copy import copy - import aesara import aesara.tensor as at import numpy as np @@ -27,7 +25,8 @@ from aesara.graph.basic import Apply from aesara.graph.op import Op from aesara.tensor.nlinalg import det, eigh, matrix_inverse, trace -from aesara.tensor.random.basic import DirichletRV, dirichlet +from aesara.tensor.random.basic import MultinomialRV, dirichlet, multivariate_normal +from aesara.tensor.random.utils import broadcast_params from aesara.tensor.slinalg import ( Cholesky, Solve, @@ -40,11 +39,10 @@ import pymc3 as pm from pymc3.aesaraf import floatX, intX -from pymc3.distributions import _logp, logp_transform, transforms +from pymc3.distributions import transforms from pymc3.distributions.continuous import ChiSquared, Normal from pymc3.distributions.dist_math import bound, factln, logpow from pymc3.distributions.distribution import Continuous, Discrete -from pymc3.distributions.shape_utils import to_tuple from pymc3.distributions.special import gammaln, multigammaln from pymc3.math import kron_diag, kron_dot, kron_solve_lower, kronecker @@ -63,10 +61,51 @@ "CAR", ] -# FIXME: These are temporary hacks -dirichlet = copy(dirichlet) -dirichlet.inplace = True - +solve_lower = Solve(A_structure="lower_triangular") +# Step methods and advi do not catch LinAlgErrors at the +# moment. We work around that by using a cholesky op +# that returns a nan as first entry instead of raising +# an error. 
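Annotation (not part of the patch): the `DistributionMeta` metaclass added in `pymc3/distributions/distribution.py` above is what makes the class-level `logp`/`logcdf` methods in this diff (note they take no `self`) usable: when a `Distribution` subclass is created, those functions are registered with the `_logp`/`_logcdf` singledispatch functions under the type of the class's `rv_op`. Below is a self-contained sketch of that mechanism, using hypothetical stand-ins (`MyRV`, `MyDist`, `DispatchMeta`) rather than the real PyMC3/Aesara classes.

from abc import ABCMeta
from functools import singledispatch


@singledispatch
def _logp(op, value, *dist_params):
    raise NotImplementedError(f"No logp registered for {type(op)}")


class MyRV:
    """Stand-in for an Aesara RandomVariable Op."""


class DispatchMeta(ABCMeta):
    def __new__(cls, name, bases, clsdict):
        new_cls = super().__new__(cls, name, bases, clsdict)
        rv_op = clsdict.get("rv_op")
        class_logp = clsdict.get("logp")
        if rv_op is not None and class_logp is not None:
            # Register the class-level `logp` under the Op's type so that
            # `_logp(op, value, *params)` finds it without the class.
            @_logp.register(type(rv_op))
            def logp(op, value, *dist_params):
                return class_logp(value, *dist_params)
        return new_cls


class MyDist(metaclass=DispatchMeta):
    rv_op = MyRV()

    def logp(value, loc):  # no `self`, exactly as in the diff
        return -((value - loc) ** 2)


print(_logp(MyRV(), 3.0, 1.0))  # -4.0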
+cholesky = Cholesky(lower=True, on_error="nan") + + +def quaddist_matrix(cov=None, chol=None, tau=None, lower=True, *args, **kwargs): + if chol is not None and not lower: + chol = chol.T + + if len([i for i in [tau, cov, chol] if i is not None]) != 1: + raise ValueError("Incompatible parameterization. Specify exactly one of tau, cov, or chol.") + + if cov is not None: + cov = aet.as_tensor_variable(cov) + if cov.ndim != 2: + raise ValueError("cov must be two dimensional.") + elif tau is not None: + tau = aet.as_tensor_variable(tau) + if tau.ndim != 2: + raise ValueError("tau must be two dimensional.") + # TODO: What's the correct order/approach (in the non-square case)? + # `aesara.tensor.nlinalg.tensorinv`? + cov = matrix_inverse(tau) + else: + # TODO: What's the correct order/approach (in the non-square case)? + chol = aet.as_tensor_variable(chol) + if chol.ndim != 2: + raise ValueError("chol must be two dimensional.") + cov = chol.dot(chol.T) + + return cov + + +def quaddist_parse(value, mu, cov, mat_type="cov"): + """Compute (x - mu).T @ Sigma^-1 @ (x - mu) and the logdet of Sigma.""" + if value.ndim > 2 or value.ndim == 0: + raise ValueError("Invalid dimension for value: %s" % value.ndim) + if value.ndim == 1: + onedim = True + value = value[None, :] + else: + onedim = False delta = value - mu @@ -87,30 +126,30 @@ def quaddist_chol(delta, chol_mat): - diag = at.diag(chol_mat) + diag = aet.nlinalg.diag(chol_mat) # Check if the covariance matrix is positive definite. - ok = at.all(diag > 0) + ok = aet.all(diag > 0) # If not, replace the diagonal. We return -inf later, but # need to prevent solve_lower from throwing an exception. - chol_cov = at.switch(ok, chol_mat, 1) + chol_cov = aet.switch(ok, chol_mat, 1) delta_trans = solve_lower(chol_cov, delta.T).T quaddist = (delta_trans ** 2).sum(axis=-1) - logdet = at.sum(at.log(diag)) + logdet = aet.sum(aet.log(diag)) return quaddist, logdet, ok def quaddist_tau(delta, chol_mat): - diag = at.nlinalg.diag(chol_mat) + diag = aet.nlinalg.diag(chol_mat) # Check if the precision matrix is positive definite. - ok = at.all(diag > 0) + ok = aet.all(diag > 0) # If not, replace the diagonal. We return -inf later, but # need to prevent solve_lower from throwing an exception. - chol_tau = at.switch(ok, chol_mat, 1) + chol_tau = aet.switch(ok, chol_mat, 1) - delta_trans = at.dot(delta, chol_tau) + delta_trans = aet.dot(delta, chol_tau) quaddist = (delta_trans ** 2).sum(axis=-1) - logdet = -at.sum(at.log(diag)) + logdet = -aet.sum(aet.log(diag)) return quaddist, logdet, ok @@ -180,57 +219,11 @@ class MvNormal(Continuous): """ rv_op = multivariate_normal - def __init__(self, mu, cov=None, tau=None, chol=None, lower=True, *args, **kwargs): - super().__init__(mu=mu, cov=cov, tau=tau, chol=chol, lower=lower, *args, **kwargs) - self.mean = self.median = self.mode = self.mu = self.mu - - def random(self, point=None, size=None): - """ - Draw random values from Multivariate Normal distribution. - - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). 
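Annotation (not part of the patch): the `quaddist_parse`/`quaddist_chol` helpers above compute the Mahalanobis term (x - mu)^T cov^{-1} (x - mu) and the log-determinant from a Cholesky factor, using a triangular solve instead of an explicit matrix inverse. The same arithmetic, sketched with NumPy/SciPy:

import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.default_rng(0)
mu = np.zeros(3)
A = rng.normal(size=(3, 3))
cov = A @ A.T + 3 * np.eye(3)        # a positive-definite covariance
L = np.linalg.cholesky(cov)          # cov = L @ L.T, L lower triangular
x = rng.normal(size=3)

delta = x - mu
z = solve_triangular(L, delta, lower=True)   # mirrors solve_lower(chol_cov, delta.T)
quaddist = (z ** 2).sum()
logdet = np.log(np.diag(L)).sum()            # equals 0.5 * log|cov|

assert np.isclose(quaddist, delta @ np.linalg.inv(cov) @ delta)
assert np.isclose(2 * logdet, np.log(np.linalg.det(cov)))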
- - Returns - ------- - array - """ - # size = to_tuple(size) - # - # param_attribute = getattr(self, "chol_cov" if self._cov_type == "chol" else self._cov_type) - # mu, param = draw_values([self.mu, param_attribute], point=point, size=size) - # - # dist_shape = to_tuple(self.shape) - # output_shape = size + dist_shape - # - # # Simple, there can be only be 1 batch dimension, only available from `mu`. - # # Insert it into `param` before events, if there is a sample shape in front. - # if param.ndim > 2 and dist_shape[:-1]: - # param = param.reshape(size + (1,) + param.shape[-2:]) - # - # mu = broadcast_dist_samples_to(to_shape=output_shape, samples=[mu], size=size)[0] - # param = np.broadcast_to(param, shape=output_shape + dist_shape[-1:]) - # - # assert mu.shape == output_shape - # assert param.shape == output_shape + dist_shape[-1:] - # - # if self._cov_type == "cov": - # chol = np.linalg.cholesky(param) - # elif self._cov_type == "chol": - # chol = param - # else: # tau -> chol -> swapaxes (chol, -1, -2) -> inv ... - # lower_chol = np.linalg.cholesky(param) - # upper_chol = np.swapaxes(lower_chol, -1, -2) - # chol = np.linalg.inv(upper_chol) - # - # standard_normal = np.random.standard_normal(output_shape) - # return mu + np.einsum("...ij,...j->...i", chol, standard_normal) + @classmethod + def dist(cls, mu, cov=None, tau=None, chol=None, lower=True, **kwargs): + mu = aet.as_tensor_variable(mu) + cov = quaddist_matrix(cov, tau, chol, lower) + return super().__init__([mu, cov], **kwargs) def logp(value, mu, cov): """ @@ -343,7 +336,7 @@ def random(self, point=None, size=None): # chi2_samples = chi2_samples.reshape(chi2_samples.shape + (1,) * len(self.shape)) # return (samples / np.sqrt(chi2_samples / nu)) + mu - def logp(self, value): + def logp(value, nu, cov): """ Calculate log-probability of Multivariate Student's T distribution at specified value. @@ -361,7 +354,7 @@ def logp(self, value): k = floatX(value.shape[-1]) norm = gammaln((nu + k) / 2.0) - gammaln(nu / 2.0) - 0.5 * k * floatX(np.log(nu * np.pi)) - inner = -(nu + k) / 2.0 * at.log1p(quaddist / nu) + inner = -(nu + k) / 2.0 * aet.log1p(quaddist / nu) return bound(norm + inner - logdet, ok) def _distr_parameters_for_repr(self): @@ -403,44 +396,64 @@ def dist(cls, a, **kwargs): return super().dist([a], **kwargs) + def logp(value, a): + """ + Calculate log-probability of Dirichlet distribution + at specified value. + + Parameters + ---------- + value: numeric + Value for which log-probability is calculated. 
+ + Returns + ------- + TensorVariable + """ + # only defined for sum(value) == 1 + return bound( + aet.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(aet.sum(a, axis=-1)), + aet.all(value >= 0), + aet.all(value <= 1), + aet.all(a > 0), + broadcast_conditions=False, + ) + + def transform(rv_var): + + if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + # If this variable is just a bunch of scalars/degenerate + # Dirichlets, we can't transform it + return None + + return transforms.stick_breaking + def _distr_parameters_for_repr(self): return ["a"] -@logp_transform.register(DirichletRV) -def dirichlet_transform(op, rv_var): - - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: - # If this variable is just a bunch of scalars/degenerate - # Dirichlets, we can't transform it - return None +class MultinomialRV(MultinomialRV): + """Aesara's `MultinomialRV` doesn't broadcast; this one does.""" - return transforms.stick_breaking + @classmethod + def rng_fn(cls, rng, n, p, size): + if n.ndim > 0 or p.ndim > 1: + n, p = broadcast_params([n, p], cls.ndims_params) + size = tuple(size or ()) + if size: + n = np.broadcast_to(n, size + n.shape) + p = np.broadcast_to(p, size + p.shape) -@_logp.register(DirichletRV) -def dirichlet_logp(op, value, a): - """ - Calculate log-probability of Dirichlet distribution - at specified value. + res = np.empty(p.shape) + for idx in np.ndindex(p.shape[:-1]): + res[idx] = rng.multinomial(n[idx], p[idx]) + return res + else: + return rng.multinomial(n, p, size=size) - Parameters - ---------- - value: numeric - Value for which log-probability is calculated. - Returns - ------- - TensorVariable - """ - # only defined for sum(value) == 1 - return bound( - aet.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(aet.sum(a, axis=-1)), - aet.all(value >= 0), - aet.all(value <= 1), - aet.all(a > 0), - broadcast_conditions=False, - ) +multinomial = MultinomialRV() class MultinomialRV(MultinomialRV): @@ -504,73 +517,18 @@ class Multinomial(Discrete): @classmethod def dist(cls, n, p, *args, **kwargs): - p = p / at.sum(p, axis=-1, keepdims=True) - n = at.as_tensor_variable(n) - p = at.as_tensor_variable(p) + # p = p / aet.sum(p, axis=-1, keepdims=True) + n = aet.as_tensor_variable(n) + p = aet.as_tensor_variable(p) # mean = n * p - # mode = at.cast(at.round(mean), "int32") - # diff = n - at.sum(mode, axis=-1, keepdims=True) - # inc_bool_arr = at.abs_(diff) > 0 - # mode = at.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) + # mode = aet.cast(aet.round(mean), "int32") + # diff = n - aet.sum(mode, axis=-1, keepdims=True) + # inc_bool_arr = aet.abs_(diff) > 0 + # mode = aet.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) return super().dist([n, p], *args, **kwargs) - # Thanks to the default shape handling done in generate_values, the last - # axis of n is a dummy axis that allows it to broadcast well with p - n = np.broadcast_to(n, size) - p = np.broadcast_to(p, size) - n = n[..., 0] - - # np.random.multinomial needs `n` to be a scalar int and `p` a - # sequence so we semi flatten them and iterate over them - size_ = to_tuple(raw_size) - if p.ndim > len(size_) and p.shape[: len(size_)] == size_: - # p and n have the size_ prepend so we don't need it in np.random - n_ = n.reshape([-1]) - p_ = p.reshape([-1, p.shape[-1]]) - samples = np.array([np.random.multinomial(nn, pp) for nn, pp in zip(n_, p_)]) - samples = samples.reshape(p.shape) - else: - # p and n don't have the size prepend - n_ = n.reshape([-1]) - p_ = 
p.reshape([-1, p.shape[-1]]) - samples = np.array( - [np.random.multinomial(nn, pp, size=size_) for nn, pp in zip(n_, p_)] - ) - samples = np.moveaxis(samples, 0, -1) - samples = samples.reshape(size + p.shape) - # We cast back to the original dtype - return samples.astype(original_dtype) - - def random(self, point=None, size=None): - """ - Draw random values from Multinomial distribution. - - Parameters - ---------- - point: dict, optional - Dict of variable values on which random values are to be - conditioned (uses default point if not specified). - size: int, optional - Desired size of random sample (returns one sample if not - specified). - - Returns - ------- - array - """ - # n, p = draw_values([self.n, self.p], point=point, size=size) - # samples = generate_samples( - # self._random, - # n, - # p, - # dist_shape=self.shape, - # not_broadcast_kwargs={"raw_size": size}, - # size=size, - # ) - # return samples - - def logp(self, x): + def logp(value, n, p): """ Calculate log-probability of Multinomial distribution at specified value. @@ -585,12 +543,12 @@ def logp(self, x): TensorVariable """ return bound( - factln(n) + at.sum(-factln(value) + logpow(p, value), axis=-1), - at.all(value >= 0), - at.all(at.eq(at.sum(value, axis=-1), n)), - at.all(p <= 1), - at.all(at.eq(at.sum(p, axis=-1), 1)), - at.all(at.ge(n, 0)), + factln(n) + aet.sum(-factln(value) + logpow(p, value), axis=-1), + aet.all(value >= 0), + aet.all(aet.eq(aet.sum(value, axis=-1), n)), + aet.all(p <= 1), + aet.all(aet.eq(aet.sum(p, axis=-1), 1)), + aet.all(aet.ge(n, 0)), broadcast_conditions=False, ) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index c644c1da3b..ad585e3074 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -1410,7 +1410,6 @@ def test_inverse_gamma_logcdf(self): condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to scaling issues", ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_inverse_gamma_alt_params(self): def test_fun(value, mu, sigma): alpha, beta = InverseGamma._get_alpha_beta(None, None, mu, sigma) @@ -1485,7 +1484,6 @@ def test_skew_normal(self): decimal=select_by_precision(float64=5, float32=3), ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_binomial(self): self.check_logp( Binomial, @@ -1554,22 +1552,13 @@ def test_beta_binomial_selfconsistency(self): {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_bernoulli(self): - self.check_logp( - Bernoulli, - Bool, - {"logit_p": R}, - lambda value, logit_p: sp.bernoulli.logpmf(value, scipy.special.expit(logit_p)), - ) - self.check_logcdf( - Bernoulli, - Bool, - {"logit_p": R}, - lambda value, logit_p: sp.bernoulli.logcdf(value, scipy.special.expit(logit_p)), - ) - def test_bernoulli(self): + # self.check_logp( + # Bernoulli, + # Bool, + # {"logit_p": R}, + # lambda value, logit_p: sp.bernoulli.logpmf(value, scipy.special.expit(logit_p)), + # ) self.check_logp( Bernoulli, Bool, @@ -1582,6 +1571,12 @@ def test_bernoulli(self): {"p": Unit}, lambda value, p: sp.bernoulli.logcdf(value, p), ) + # self.check_logcdf( + # Bernoulli, + # Bool, + # {"logit_p": R}, + # lambda value, logit_p: sp.bernoulli.logcdf(value, scipy.special.expit(logit_p)), + # ) self.check_selfconsistency_discrete_logcdf( Bernoulli, Bool, @@ -1602,7 +1597,6 @@ def test_discrete_weibull(self): {"q": Unit, "beta": Rplusdunif}, ) - 
@pytest.mark.xfail(reason="Distribution not refactored yet") def test_poisson(self): self.check_logp( Poisson, @@ -1990,31 +1984,28 @@ def test_dirichlet_2D(self): ) @pytest.mark.parametrize("n", [2, 3]) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial(self, n): self.check_logp( Multinomial, Vector(Nat, n), {"p": Simplex(n), "n": Nat}, multinomial_logpdf ) - @pytest.mark.skip(reason="Moment calculations have not been refactored yet") - @pytest.mark.parametrize( - "p,n", - [ - [[0.25, 0.25, 0.25, 0.25], 1], - [[0.3, 0.6, 0.05, 0.05], 2], - [[0.3, 0.6, 0.05, 0.05], 10], - ], - ) - @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_multinomial_mode(self, p, n): - _p = np.array(p) - with Model() as model: - m = Multinomial("m", n, _p, _p.shape) - assert_allclose(m.distribution.mode.eval().sum(), n) - _p = np.array([p, p]) - with Model() as model: - m = Multinomial("m", n, _p, _p.shape) - assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) + # @pytest.mark.parametrize( + # "p,n", + # [ + # [[0.25, 0.25, 0.25, 0.25], 1], + # [[0.3, 0.6, 0.05, 0.05], 2], + # [[0.3, 0.6, 0.05, 0.05], 10], + # ], + # ) + # def test_multinomial_mode(self, p, n): + # _p = np.array(p) + # with Model() as model: + # m = Multinomial("m", n, _p, _p.shape) + # assert_allclose(m.distribution.mode.eval().sum(), n) + # _p = np.array([p, p]) + # with Model() as model: + # m = Multinomial("m", n, _p, _p.shape) + # assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) @pytest.mark.parametrize( "p, size, n", @@ -2036,32 +2027,30 @@ def test_multinomial_mode(self, p, n): [[[0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25]], (2, 4), [17, 19]], ], ) - @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_multinomial_random(self, p, shape, n): + def test_multinomial_random(self, p, size, n): p = np.asarray(p) with Model() as model: - m = Multinomial("m", n=n, p=p, size=shape) - m.random() + m = Multinomial("m", n=n, p=p, size=size) - @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_multinomial_mode_with_shape(self): - n = [1, 10] - p = np.asarray([[0.25, 0.25, 0.25, 0.25], [0.26, 0.26, 0.26, 0.22]]) - with Model() as model: - m = Multinomial("m", n=n, p=p, size=(2, 4)) - assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) + assert m.eval().shape == size + p.shape + + # def test_multinomial_mode_with_shape(self): + # n = [1, 10] + # p = np.asarray([[0.25, 0.25, 0.25, 0.25], [0.26, 0.26, 0.26, 0.22]]) + # with Model() as model: + # m = Multinomial("m", n=n, p=p, size=(2, 4)) + # assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec(self): vals = np.array([[2, 4, 4], [3, 3, 4]]) p = np.array([0.2, 0.3, 0.5]) n = 10 with Model() as model_single: - Multinomial("m", n=n, p=p, size=len(p)) + Multinomial("m", n=n, p=p) with Model() as model_many: - Multinomial("m", n=n, p=p, size=vals.shape) + Multinomial("m", n=n, p=p, size=2) assert_almost_equal( scipy.stats.multinomial.logpmf(vals, n, p), @@ -2081,14 +2070,13 @@ def test_multinomial_vec(self): decimal=4, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec_1d_n(self): vals = np.array([[2, 4, 4], [4, 3, 4]]) p = np.array([0.2, 0.3, 0.5]) ns = np.array([10, 11]) with Model() as model: - Multinomial("m", n=ns, p=p, size=vals.shape) + Multinomial("m", n=ns, p=p) assert_almost_equal( sum([multinomial_logpdf(val, n, p) for val, n in zip(vals, 
ns)]), @@ -2096,14 +2084,13 @@ def test_multinomial_vec_1d_n(self): decimal=4, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec_1d_n_2d_p(self): vals = np.array([[2, 4, 4], [4, 3, 4]]) ps = np.array([[0.2, 0.3, 0.5], [0.9, 0.09, 0.01]]) ns = np.array([10, 11]) with Model() as model: - Multinomial("m", n=ns, p=ps, size=vals.shape) + Multinomial("m", n=ns, p=ps) assert_almost_equal( sum([multinomial_logpdf(val, n, p) for val, n, p in zip(vals, ns, ps)]), @@ -2111,14 +2098,13 @@ def test_multinomial_vec_1d_n_2d_p(self): decimal=4, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_multinomial_vec_2d_p(self): vals = np.array([[2, 4, 4], [3, 3, 4]]) ps = np.array([[0.2, 0.3, 0.5], [0.3, 0.3, 0.4]]) n = 10 with Model() as model: - Multinomial("m", n=n, p=ps, size=vals.shape) + Multinomial("m", n=n, p=ps) assert_almost_equal( sum([multinomial_logpdf(val, n, p) for val, p in zip(vals, ps)]), @@ -2126,7 +2112,6 @@ def test_multinomial_vec_2d_p(self): decimal=4, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_batch_multinomial(self): n = 10 vals = np.zeros((4, 5, 3), dtype="int32") @@ -2135,7 +2120,8 @@ def test_batch_multinomial(self): np.put_along_axis(vals, inds, n, axis=-1) np.put_along_axis(p, inds, 1, axis=-1) - dist = Multinomial.dist(n=n, p=p, size=vals.shape) + dist = Multinomial.dist(n=n, p=p) + value = aet.tensor3(dtype="int32") value.tag.test_value = np.zeros_like(vals, dtype="int32") logp = aet.exp(logpt(dist, value)) From 5bb71d94b0fc8469b538dd75c61c253c53b57a38 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Tue, 16 Mar 2021 23:00:40 -0500 Subject: [PATCH 16/44] Make transform objects stateless --- pymc3/backends/base.py | 2 +- pymc3/distributions/__init__.py | 93 +++++++----- pymc3/distributions/continuous.py | 27 ++-- pymc3/distributions/multivariate.py | 4 +- pymc3/distributions/transforms.py | 225 +++++++++++++--------------- pymc3/model.py | 9 +- pymc3/tests/test_distributions.py | 24 ++- pymc3/tests/test_transforms.py | 88 +++++------ 8 files changed, 241 insertions(+), 231 deletions(-) diff --git a/pymc3/backends/base.py b/pymc3/backends/base.py index 2187ed914b..e9227cfd95 100644 --- a/pymc3/backends/base.py +++ b/pymc3/backends/base.py @@ -68,7 +68,7 @@ def __init__(self, name, model=None, vars=None, test_point=None): if transform: # We need to create and add an un-transformed version of # each transformed variable - untrans_var = transform.backward(var) + untrans_var = transform.backward(v, var) untrans_var.name = v.name vars.append(untrans_var) vars.append(var) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index a7be6d0a7d..9f76c57d75 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
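Annotation (not part of the patch): the commit above makes transform objects stateless, so `forward`, `backward`, and `jacobian_det` now take the pair `(rv_var, rv_value)`, and a transform such as `Interval` stores only a `param_extract_fn` that reads the bounds off the random variable at call time (hence the `transform.backward(v, var)` call in `pymc3/backends/base.py`). Below is a minimal NumPy sketch of the idea, using hypothetical stand-ins (`StatelessInterval`, `FakeUniformRV`) instead of the real Aesara-based classes.

import numpy as np


class StatelessInterval:
    def __init__(self, param_extract_fn):
        # No numeric state on the transform itself: just a function that
        # pulls (lower, upper) out of the random variable it is handed.
        self.param_extract_fn = param_extract_fn

    def forward(self, rv_var, rv_value):
        a, b = self.param_extract_fn(rv_var)
        return np.log(rv_value - a) - np.log(b - rv_value)

    def backward(self, rv_var, rv_value):
        a, b = self.param_extract_fn(rv_var)
        s = 1.0 / (1.0 + np.exp(-rv_value))  # sigmoid
        return s * b + (1 - s) * a


class FakeUniformRV:
    # Hypothetical random variable that carries its own bounds.
    lower, upper = 0.0, 2.0


rv = FakeUniformRV()
transform = StatelessInterval(lambda rv_var: (rv_var.lower, rv_var.upper))
x = 0.5
z = transform.forward(rv, x)
print(np.isclose(transform.backward(rv, z), x))  # True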
+import warnings + from functools import singledispatch from itertools import chain from typing import Generator, List, Optional, Tuple, Union @@ -20,7 +22,7 @@ from aesara import config from aesara.graph.basic import Variable, ancestors, clone_replace -from aesara.graph.op import compute_test_value +from aesara.graph.op import Op, compute_test_value from aesara.tensor.random.op import Observed, RandomVariable from aesara.tensor.subtensor import AdvancedSubtensor, AdvancedSubtensor1, Subtensor from aesara.tensor.var import TensorVariable @@ -33,7 +35,7 @@ @singledispatch -def logp_transform(op, inputs): +def logp_transform(op: Op): return None @@ -141,7 +143,8 @@ def change_rv_size( def rv_log_likelihood_args( rv_var: TensorVariable, - transformed: Optional[bool] = True, + *, + return_observations: bool = True, ) -> Tuple[TensorVariable, TensorVariable]: """Get a `RandomVariable` and its corresponding log-likelihood `TensorVariable` value. @@ -151,8 +154,9 @@ def rv_log_likelihood_args( A variable corresponding to a `RandomVariable`, whether directly or indirectly (e.g. an observed variable that's the output of an `Observed` `Op`). - transformed - When ``True``, return the transformed value var. + return_observations + When ``True``, return the observed values in place of the log-likelihood + value variable. Returns ======= @@ -163,12 +167,14 @@ def rv_log_likelihood_args( """ if rv_var.owner and isinstance(rv_var.owner.op, Observed): - return tuple(rv_var.owner.inputs) - elif hasattr(rv_var.tag, "value_var"): - rv_value = rv_var.tag.value_var - return rv_var, rv_value - else: - return rv_var, None + rv_var, obs_var = rv_var.owner.inputs + if return_observations: + return rv_var, obs_var + else: + return rv_var, rv_log_likelihood_args(rv_var)[1] + + rv_value = getattr(rv_var.tag, "value_var", None) + return rv_var, rv_value def rv_ancestors(graphs: List[TensorVariable]) -> Generator[TensorVariable, None, None]: @@ -217,7 +223,7 @@ def sample_to_measure_vars( if not (anc.owner and isinstance(anc.owner.op, RandomVariable)): continue - _, value_var = rv_log_likelihood_args(anc) + _, value_var = rv_log_likelihood_args(anc, return_observations=False) if value_var is not None: replace[anc] = value_var @@ -233,8 +239,10 @@ def sample_to_measure_vars( def logpt( rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, - jacobian: Optional[bool] = True, - scaling: Optional[bool] = True, + *, + jacobian: bool = True, + scaling: bool = True, + transformed: bool = True, **kwargs, ) -> TensorVariable: """Create a measure-space (i.e. log-likelihood) graph for a random variable at a given point. @@ -257,6 +265,8 @@ def logpt( Whether or not to include the Jacobian term. scaling A scaling term to apply to the generated log-likelihood graph. + transformed + Apply transforms. 
""" @@ -282,22 +292,22 @@ def logpt( raise NotImplementedError("Missing value support is incomplete") - # "Flatten" and sum an array of indexed RVs' log-likelihoods - rv_var, missing_values = rv_node.inputs - - missing_values = missing_values.data - logp_var = aet.sum( - [ - logpt( - rv_var, - ) - for idx, missing in zip( - np.ndindex(missing_values.shape), missing_values.flatten() - ) - if missing - ] - ) - return logp_var + # # "Flatten" and sum an array of indexed RVs' log-likelihoods + # rv_var, missing_values = rv_node.inputs + # + # missing_values = missing_values.data + # logp_var = aet.sum( + # [ + # logpt( + # rv_var, + # ) + # for idx, missing in zip( + # np.ndindex(missing_values.shape), missing_values.flatten() + # ) + # if missing + # ] + # ) + # return logp_var return aet.zeros_like(rv_var) @@ -312,15 +322,16 @@ def logpt( # If any of the measure vars are transformed measure-space variables # (signified by having a `transform` value in their tags), then we apply # the their transforms and add their Jacobians (when enabled) - if transform: - logp_var = _logp(rv_node.op, transform.backward(rv_value_var), *dist_params, **kwargs) + if transform and transformed: + logp_var = _logp(rv_node.op, transform.backward(rv_var, rv_value), *dist_params, **kwargs) + logp_var = transform_logp( logp_var, tuple(replacements.values()), ) if jacobian: - transformed_jacobian = transform.jacobian_det(rv_value_var) + transformed_jacobian = transform.jacobian_det(rv_var, rv_value) if transformed_jacobian: if logp_var.ndim > transformed_jacobian.ndim: logp_var = logp_var.sum(axis=-1) @@ -347,11 +358,17 @@ def transform_logp(logp_var: TensorVariable, inputs: List[TensorVariable]) -> Te for measure_var in inputs: transform = getattr(measure_var.tag, "transform", None) + rv_var = getattr(measure_var.tag, "rv_var", None) + + if transform is not None and rv_var is None: + warnings.warn( + f"A transform was found for {measure_var} but not a corresponding random variable" + ) - if transform is None: + if transform is None or rv_var is None: continue - trans_rv_value = transform.backward(measure_var) + trans_rv_value = transform.backward(rv_var, measure_var) trans_replacements[measure_var] = trans_rv_value if trans_replacements: @@ -361,7 +378,7 @@ def transform_logp(logp_var: TensorVariable, inputs: List[TensorVariable]) -> Te @singledispatch -def _logp(op, value, *dist_params, **kwargs): +def _logp(op: Op, value: TensorVariable, *dist_params, **kwargs): """Create a log-likelihood graph. 
This function dispatches on the type of `op`, which should be a subclass @@ -372,7 +389,9 @@ def _logp(op, value, *dist_params, **kwargs): return aet.zeros_like(value) -def logcdf(rv_var, rv_value, jacobian=True, **kwargs): +def logcdf( + rv_var: TensorVariable, rv_value: Optional[TensorVariable], jacobian: bool = True, **kwargs +): """Create a log-CDF graph.""" rv_var, _ = rv_log_likelihood_args(rv_var) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index aa9d45c044..12fa7616c4 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -104,31 +104,24 @@ class BoundedContinuous(Continuous): @logp_transform.register(PositiveContinuous) -def pos_cont_transform(op, rv_var): +def pos_cont_transform(op): return transforms.log @logp_transform.register(UnitContinuous) -def unit_cont_transform(op, rv_var): +def unit_cont_transform(op): return transforms.logodds @logp_transform.register(BoundedContinuous) -def bounded_cont_transform(op, rv_var): - _, _, _, lower, upper = rv_var.owner.inputs - lower = aet.as_tensor_variable(lower) if lower is not None else None - upper = aet.as_tensor_variable(upper) if upper is not None else None - - if lower is None and upper is None: - transform = None - elif lower is not None and upper is None: - transform = transforms.lowerbound(lower) - elif lower is None and upper is not None: - transform = transforms.upperbound(upper) - else: - transform = transforms.interval(lower, upper) - - return transform +def bounded_cont_transform(op): + def transform_params(rv_var): + _, _, _, lower, upper = rv_var.owner.inputs + lower = aet.as_tensor_variable(lower) if lower is not None else None + upper = aet.as_tensor_variable(upper) if upper is not None else None + return lower, upper + + return transforms.interval(transform_params) def assert_negative_support(var, label, distname, value=-1e-6): diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 60f055b66f..b7f3386bb3 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -126,7 +126,7 @@ def quaddist_parse(value, mu, cov, mat_type="cov"): def quaddist_chol(delta, chol_mat): - diag = aet.nlinalg.diag(chol_mat) + diag = aet.diag(chol_mat) # Check if the covariance matrix is positive definite. ok = aet.all(diag > 0) # If not, replace the diagonal. We return -inf later, but @@ -223,7 +223,7 @@ class MvNormal(Continuous): def dist(cls, mu, cov=None, tau=None, chol=None, lower=True, **kwargs): mu = aet.as_tensor_variable(mu) cov = quaddist_matrix(cov, tau, chol, lower) - return super().__init__([mu, cov], **kwargs) + return super().dist([mu, cov], **kwargs) def logp(value, mu, cov): """ diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index 67f47f1028..dc54a7a444 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import aesara.tensor as at +import aesara.tensor as aet from aesara.tensor.subtensor import advanced_set_subtensor1 +from aesara.tensor.var import TensorVariable from pymc3.aesaraf import floatX, gradient from pymc3.math import invlogit, logit, logsumexp @@ -61,10 +62,25 @@ def forward(self, rv_var: TensorVariable, rv_value: TensorVariable) -> TensorVar `rv_var`, it will transform the random variable `rv_value` after sampling from `rv_var`. 
- def backward(self, z): - """Applies inverse of transformation to input variable `z`. - When transform is used on some distribution `p`, which has observed values `z`, it is used to - transform the values of `z` correctly to the support of `p`. + **Do not apply transforms to `rv_var`.** `rv_var` is only provided + as a means of describing the random variable associated with `rv_value`. + `rv_value` is the variable that should be transformed, and the transform + can use information from `rv_var`--within `param_extract_fn`--to do + that (e.g. the random variable's parameters via `rv_var.owner.inputs`). + + Parameters + ---------- + rv_var + The random variable. + rv_value + The variable representing a value of `rv_var`. + + When a transform is applied to a value of some random variable + `rv_var`, it will transform the random variable `rv_value` after + sampling from `rv_var`. + + def backward(self, rv_var: TensorVariable, rv_value: TensorVariable) -> TensorVariable: + """Applies inverse of transformation. Parameters ---------- @@ -103,8 +119,10 @@ def __str__(self): class ElemwiseTransform(Transform): - def jacobian_det(self, x): - grad = aet.reshape(gradient(aet.sum(self.backward(x)), [x]), x.shape) + def jacobian_det(self, rv_var, rv_value): + grad = aet.reshape( + gradient(aet.sum(self.backward(rv_var, rv_value)), [rv_value]), rv_value.shape + ) return aet.log(aet.abs_(grad)) @@ -112,13 +130,13 @@ class Log(ElemwiseTransform): name = "log" def backward(self, rv_var, rv_value): - return at.exp(rv_value) + return aet.exp(rv_value) def forward(self, rv_var, rv_value): - return at.log(rv_value) + return aet.log(rv_value) - def jacobian_det(self, x): - return x + def jacobian_det(self, rv_var, rv_value): + return rv_value log = Log() @@ -128,7 +146,7 @@ class LogExpM1(ElemwiseTransform): name = "log_exp_m1" def backward(self, rv_var, rv_value): - return at.nnet.softplus(rv_value) + return aet.nnet.softplus(rv_value) def forward(self, rv_var, rv_value): """Inverse operation of softplus. 
@@ -136,10 +154,10 @@ def forward(self, rv_var, rv_value): y = Log(Exp(x) - 1) = Log(1 - Exp(-x)) + x """ - return aet.log(1.0 - aet.exp(-x)) + x + return aet.log(1.0 - aet.exp(-rv_value)) + rv_value - def jacobian_det(self, x): - return -aet.nnet.softplus(-x) + def jacobian_det(self, rv_var, rv_value): + return -aet.nnet.softplus(-rv_value) log_exp_m1 = LogExpM1() @@ -151,6 +169,9 @@ class LogOdds(ElemwiseTransform): def backward(self, rv_var, rv_value): return invlogit(rv_value, 0.0) + def forward(self, rv_var, rv_value): + return logit(rv_value) + logodds = LogOdds() @@ -160,95 +181,63 @@ class Interval(ElemwiseTransform): name = "interval" - def __init__(self, a, b): - self.a = aet.as_tensor_variable(a) - self.b = aet.as_tensor_variable(b) - - def backward(self, x): - a, b = self.a, self.b - sigmoid_x = aet.nnet.sigmoid(x) - r = sigmoid_x * b + (1 - sigmoid_x) * a - return r - - def forward(self, x): - a, b = self.a, self.b - return aet.log(x - a) - aet.log(b - x) - - def jacobian_det(self, x): - s = aet.nnet.softplus(-x) - return aet.log(self.b - self.a) - 2 * s - x - - -interval = Interval - - -class LowerBound(ElemwiseTransform): - """Transform from real line interval [a,inf] to whole real line.""" - - name = "lowerbound" - - def __init__(self, a): - self.a = aet.as_tensor_variable(a) - - def backward(self, x): - a = self.a - r = aet.exp(x) + a - return r - - def forward(self, x): - a = self.a - return aet.log(x - a) - - def jacobian_det(self, x): - return x - - -lowerbound = LowerBound -""" -Alias for ``LowerBound`` (:class: LowerBound) Transform (:class: Transform) class -for use in the ``transform`` argument of a random variable. -""" - - -class UpperBound(ElemwiseTransform): - """Transform from real line interval [-inf,b] to whole real line.""" - - name = "upperbound" + def __init__(self, param_extract_fn): + self.param_extract_fn = param_extract_fn - def __init__(self, b): - self.b = aet.as_tensor_variable(b) + def backward(self, rv_var, rv_value): + a, b = self.param_extract_fn(rv_var) + + if a is not None and b is not None: + sigmoid_x = aet.nnet.sigmoid(rv_value) + return sigmoid_x * b + (1 - sigmoid_x) * a + elif a is not None: + return aet.exp(rv_value) + a + elif b is not None: + return b - aet.exp(rv_value) + else: + return rv_value - def backward(self, x): - b = self.b - r = b - aet.exp(x) - return r + def forward(self, rv_var, rv_value): + a, b = self.param_extract_fn(rv_var) + if a is not None and b is not None: + return aet.log(rv_value - a) - aet.log(b - rv_value) + elif a is not None: + return aet.log(rv_value - a) + elif b is not None: + return aet.log(b - rv_value) + else: + return rv_value - def forward(self, x): - b = self.b - return aet.log(b - x) + def jacobian_det(self, rv_var, rv_value): + a, b = self.param_extract_fn(rv_var) - def jacobian_det(self, x): - return x + if a is not None and b is not None: + s = aet.nnet.softplus(-rv_value) + return aet.log(b - a) - 2 * s - rv_value + else: + return rv_value -upperbound = UpperBound -""" -Alias for ``UpperBound`` (:class: UpperBound) Transform (:class: Transform) class -for use in the ``transform`` argument of a random variable. 
-""" +interval = Interval class Ordered(Transform): name = "ordered" def backward(self, rv_var, rv_value): - x = at.zeros(rv_value.shape) - x = at.inc_subtensor(x[..., 0], rv_value[..., 0]) - x = at.inc_subtensor(x[..., 1:], at.exp(rv_value[..., 1:])) - return at.cumsum(x, axis=-1) + x = aet.zeros(rv_value.shape) + x = aet.inc_subtensor(x[..., 0], rv_value[..., 0]) + x = aet.inc_subtensor(x[..., 1:], aet.exp(rv_value[..., 1:])) + return aet.cumsum(x, axis=-1) + + def forward(self, rv_var, rv_value): + y = aet.zeros(rv_value.shape) + y = aet.inc_subtensor(y[..., 0], rv_value[..., 0]) + y = aet.inc_subtensor(y[..., 1:], aet.log(rv_value[..., 1:] - rv_value[..., :-1])) + return y - def jacobian_det(self, y): - return aet.sum(y[..., 1:], axis=-1) + def jacobian_det(self, rv_var, rv_value): + return aet.sum(rv_value[..., 1:], axis=-1) ordered = Ordered() @@ -267,14 +256,14 @@ class SumTo1(Transform): name = "sumto1" def backward(self, rv_var, rv_value): - remaining = 1 - at.sum(rv_value[..., :], axis=-1, keepdims=True) - return at.concatenate([rv_value[..., :], remaining], axis=-1) + remaining = 1 - aet.sum(rv_value[..., :], axis=-1, keepdims=True) + return aet.concatenate([rv_value[..., :], remaining], axis=-1) def forward(self, rv_var, rv_value): return rv_value[..., :-1] - def jacobian_det(self, x): - y = aet.zeros(x.shape) + def jacobian_det(self, rv_var, rv_value): + y = aet.zeros(rv_value.shape) return aet.sum(y, axis=-1) @@ -294,11 +283,6 @@ class StickBreaking(Transform): name = "stickbreaking" def forward(self, rv_var, rv_value): - if rv_var.broadcastable[-1]: - # If this variable is just a bunch of scalars/degenerate - # Dirichlets, we can't transform it - return rv_value - x = rv_value.T n = x.shape[0] lx = at.log(x) @@ -306,8 +290,8 @@ def forward(self, rv_var, rv_value): y = lx[:-1] - shift return floatX(y.T) - def backward(self, y_): - y = y_.T + def backward(self, rv_var, rv_value): + y = rv_value.T y = aet.concatenate([y, -aet.sum(y, 0, keepdims=True)]) # "softmax" with vector support and no deprication warning: e_y = at.exp(y - at.max(y, 0, keepdims=True)) @@ -315,11 +299,6 @@ def backward(self, y_): return floatX(x.T) def jacobian_det(self, rv_var, rv_value): - if rv_var.broadcastable[-1]: - # If this variable is just a bunch of scalars/degenerate - # Dirichlets, we can't transform it - return at.ones_like(rv_value) - y = rv_value.T Km1 = y.shape[0] + 1 sy = at.sum(y, 0, keepdims=True) @@ -338,10 +317,13 @@ class Circular(ElemwiseTransform): name = "circular" def backward(self, rv_var, rv_value): - return at.arctan2(at.sin(rv_value), at.cos(rv_value)) + return aet.arctan2(aet.sin(rv_value), aet.cos(rv_value)) + + def forward(self, rv_var, rv_value): + return aet.as_tensor_variable(rv_value) - def jacobian_det(self, x): - return aet.zeros(x.shape) + def jacobian_det(self, rv_var, rv_value): + return aet.zeros(rv_value.shape) circular = Circular() @@ -355,10 +337,15 @@ def __init__(self, param_extract_fn): def backward(self, rv_var, rv_value): diag_idxs = self.param_extract_fn(rv_var) - return advanced_set_subtensor1(rv_value, at.exp(rv_value[diag_idxs]), diag_idxs) + return advanced_set_subtensor1(rv_value, aet.exp(rv_value[diag_idxs]), diag_idxs) - def jacobian_det(self, y): - return aet.sum(y[self.diag_idxs]) + def forward(self, rv_var, rv_value): + diag_idxs = self.param_extract_fn(rv_var) + return advanced_set_subtensor1(rv_value, aet.log(rv_value[diag_idxs]), diag_idxs) + + def jacobian_det(self, rv_var, rv_value): + diag_idxs = self.param_extract_fn(rv_var) + return 
aet.sum(rv_value[diag_idxs]) class Chain(Transform): @@ -369,20 +356,20 @@ def __init__(self, transform_list): self.transform_list = transform_list self.name = "+".join([transf.name for transf in self.transform_list]) - def forward(self, x): - y = x + def forward(self, rv_var, rv_value): + y = rv_value for transf in self.transform_list: - y = transf.forward(y) + y = transf.forward(rv_var, y) return y - def backward(self, y): - x = y + def backward(self, rv_var, rv_value): + x = rv_value for transf in reversed(self.transform_list): x = transf.backward(rv_var, x) return x def jacobian_det(self, rv_var, rv_value): - y = at.as_tensor_variable(rv_value) + y = aet.as_tensor_variable(rv_value) det_list = [] ndim0 = y.ndim for transf in reversed(self.transform_list): diff --git a/pymc3/model.py b/pymc3/model.py index 1cb6e29c0a..7746532813 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -866,22 +866,23 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans self.free_RVs.append(rv_var) value_var = rv_var.clone() - transform = transform or logp_transform(rv_var.owner.op, rv_var) + transform = transform or logp_transform(rv_var.owner.op) if transform is not None: value_var.tag.transform = transform value_var.name = f"{rv_var.name}_{transform.name}" if aesara.config.compute_test_value != "off": - value_var.tag.test_value = transform.forward(value_var).tag.test_value + value_var.tag.test_value = transform.forward(rv_var, value_var).tag.test_value # The transformed variable needs to be a named variable in the # model, too self.named_vars[value_var.name] = value_var else: - value_var = rv_var.clone() value_var.name = rv_var.name rv_var.tag.value_var = value_var + # XXX: This is a circular reference. + value_var.tag.rv_var = rv_var elif isinstance(data, dict): @@ -1496,6 +1497,8 @@ def make_obs_var( # variable `rv_var`). value_var = rv_var.clone() rv_var.tag.value_var = value_var + # XXX: This is a circular reference. + value_var.tag.rv_var = rv_var value_var.name = f"{rv_var.name}" missing_values = None diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index ad585e3074..f78615a7ac 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -642,7 +642,27 @@ def logp_reference(args): domains["value"] = domain for pt in product(domains, n_samples=n_samples): pt = dict(pt) - pt_logp = Point(pt, model=model) + pt_d = {} + for k, v in pt.items(): + rv_var = model.named_vars.get(k) + nv = param_vars.get(k, rv_var) + nv = getattr(nv.tag, "value_var", nv) + + transform = getattr(nv.tag, "transform", None) + if transform: + # TODO: The compiled graph behind this should be cached and + # reused (if it isn't already). 
+ v = transform.forward(rv_var, v).eval() + + if nv.name in param_vars: + # Update the shared parameter variables in `param_vars` + param_vars[nv.name].set_value(v) + else: + # Create an argument entry for the (potentially + # transformed) "value" variable + pt_d[nv.name] = v + + pt_logp = Point(pt_d, model=model) pt_ref = Point(pt, filter_model_vars=False, model=model) assert_almost_equal( logp(pt_logp), @@ -1956,7 +1976,7 @@ def test_dirichlet_with_batch_shapes(self, dist_shape): d_value = d.tag.value_var d_point = d.eval() if hasattr(d_value.tag, "transform"): - d_point_trans = d_value.tag.transform.forward(d_point).eval() + d_point_trans = d_value.tag.transform.forward(d, d_point).eval() else: d_point_trans = d_point diff --git a/pymc3/tests/test_transforms.py b/pymc3/tests/test_transforms.py index 1d52fdf662..d473906d40 100644 --- a/pymc3/tests/test_transforms.py +++ b/pymc3/tests/test_transforms.py @@ -44,7 +44,7 @@ tol = 1e-7 if aesara.config.floatX == "float64" else 1e-6 -def check_transform(transform, domain, constructor=at.dscalar, test=0, rv_var=None): +def check_transform(transform, domain, constructor=aet.dscalar, test=0, rv_var=None): x = constructor("x") x.tag.test_value = test if rv_var is None: @@ -59,14 +59,12 @@ def check_transform(transform, domain, constructor=at.dscalar, test=0, rv_var=No def check_vector_transform(transform, domain, rv_var=None): - return check_transform(transform, domain, at.dvector, test=np.array([0, 0]), rv_var=rv_var) + return check_transform(transform, domain, aet.dvector, test=np.array([0, 0]), rv_var=rv_var) -def get_values(transform, domain=R, constructor=at.dscalar, test=0, rv_var=None): +def get_values(transform, domain=R, constructor=aet.dscalar, test=0, rv_var=None): x = constructor("x") x.tag.test_value = test - if rv_var is None: - rv_var = x f = aesara.function([x], transform.backward(rv_var, x)) return np.array([f(val) for val in domain.vals]) @@ -74,7 +72,7 @@ def get_values(transform, domain=R, constructor=at.dscalar, test=0, rv_var=None) def check_jacobian_det( transform, domain, - constructor=at.dscalar, + constructor=aet.dscalar, test=0, make_comparable=None, elemwise=False, @@ -83,9 +81,6 @@ def check_jacobian_det( y = constructor("y") y.tag.test_value = test - if rv_var is None: - rv_var = y - x = transform.backward(rv_var, y) if make_comparable: x = make_comparable(x) @@ -99,7 +94,7 @@ def check_jacobian_det( actual_ljd = aesara.function([y], jac) computed_ljd = aesara.function( - [y], at.as_tensor_variable(transform.jacobian_det(rv_var, y)), on_unused_input="ignore" + [y], aet.as_tensor_variable(transform.jacobian_det(rv_var, y)), on_unused_input="ignore" ) for yval in domain.vals: @@ -132,7 +127,7 @@ def test_stickbreaking_accuracy(): x = at.dvector("x") x.tag.test_value = val identity_f = aesara.function( - [x], tr.stick_breaking.forward(x, tr.stick_breaking.backward(x, x)) + [x], tr.stick_breaking.forward(None, tr.stick_breaking.backward(None, x)) ) close_to(val, identity_f(val), tol) @@ -289,14 +284,14 @@ def check_transform_elementwise_logp(self, model): pt = model.test_point array = np.random.randn(*pt[x0.name].shape) transform = x0.tag.transform - logp_nojac = logpt(x, transform.backward(array), jacobian=False) - jacob_det = transform.jacobian_det(aesara.shared(array)) - assert logpt(x).ndim == jacob_det.ndim + logp_notrans = logpt(x, transform.backward(x, array), transformed=False) jacob_det = transform.jacobian_det(x, aesara.shared(array)) assert logpt(x).ndim == jacob_det.ndim - close_to(logpt(x, array).eval(), 
elementwiselogp.eval(), tol) + v1 = logpt(x, array, jacobian=False).eval() + v2 = logp_notrans.eval() + close_to(v1, v2, tol) def check_vectortransform_elementwise_logp(self, model, vect_opt=0): x = model.free_RVs[0] @@ -306,18 +301,14 @@ def check_vectortransform_elementwise_logp(self, model, vect_opt=0): pt = model.test_point array = np.random.randn(*pt[x0.name].shape) transform = x0.tag.transform - logp_nojac = logpt(x, transform.backward(array)) - jacob_det = transform.jacobian_det(aesara.shared(array)) + logp_nojac = logpt(x, transform.backward(x, array), transformed=False) + + jacob_det = transform.jacobian_det(x, aesara.shared(array)) assert logpt(x).ndim == jacob_det.ndim - if vect_opt == 0: - # the original distribution is univariate - elementwiselogp = logp_nojac.sum(axis=-1) + jacob_det - else: - elementwiselogp = logp_nojac + jacob_det # Hack to get relative tolerance - a = logpt(x, array).eval() - b = elementwiselogp.eval() + a = logpt(x, array, jacobian=False).eval() + b = logp_nojac.eval() close_to(a, b, np.abs(0.5 * (a + b) * tol)) @pytest.mark.parametrize( @@ -328,15 +319,13 @@ def check_vectortransform_elementwise_logp(self, model, vect_opt=0): (np.ones(3) * 10.0, (4, 3)), ], ) - @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_half_normal(self, sd, shape): - model = self.build_model(pm.HalfNormal, {"sd": sd}, size=shape, transform=tr.log) + def test_half_normal(self, sd, size): + model = self.build_model(pm.HalfNormal, {"sd": sd}, size=size, transform=tr.log) self.check_transform_elementwise_logp(model) - @pytest.mark.parametrize("lam,shape", [(2.5, 2), (5.0, (2, 3)), (np.ones(3), (4, 3))]) - @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_exponential(self, lam, shape): - model = self.build_model(pm.Exponential, {"lam": lam}, size=shape, transform=tr.log) + @pytest.mark.parametrize("lam,size", [(2.5, 2), (5.0, (2, 3)), (np.ones(3), (4, 3))]) + def test_exponential(self, lam, size): + model = self.build_model(pm.Exponential, {"lam": lam}, size=size, transform=tr.log) self.check_transform_elementwise_logp(model) @pytest.mark.parametrize( @@ -347,9 +336,8 @@ def test_exponential(self, lam, shape): (np.ones(3), np.ones(3), (4, 3)), ], ) - @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_beta(self, a, b, shape): - model = self.build_model(pm.Beta, {"alpha": a, "beta": b}, size=shape, transform=tr.logodds) + def test_beta(self, a, b, size): + model = self.build_model(pm.Beta, {"alpha": a, "beta": b}, size=size, transform=tr.logodds) self.check_transform_elementwise_logp(model) @pytest.mark.parametrize( @@ -363,13 +351,13 @@ def test_beta(self, a, b, shape): def test_uniform(self, lower, upper, size): def transform_params(rv_var): _, _, _, lower, upper = rv_var.owner.inputs - lower = at.as_tensor_variable(lower) if lower is not None else None - upper = at.as_tensor_variable(upper) if upper is not None else None + lower = aet.as_tensor_variable(lower) if lower is not None else None + upper = aet.as_tensor_variable(upper) if upper is not None else None return lower, upper interval = tr.Interval(transform_params) model = self.build_model( - pm.Uniform, {"lower": lower, "upper": upper}, size=shape, transform=interval + pm.Uniform, {"lower": lower, "upper": upper}, size=size, transform=interval ) self.check_transform_elementwise_logp(model) @@ -377,17 +365,17 @@ def transform_params(rv_var): "mu,kappa,size", [(0.0, 1.0, 2), (-0.5, 5.5, (2, 3)), (np.zeros(3), np.ones(3), (4, 3))] ) 
@pytest.mark.xfail(reason="Distribution not refactored yet") - def test_vonmises(self, mu, kappa, shape): + def test_vonmises(self, mu, kappa, size): model = self.build_model( - pm.VonMises, {"mu": mu, "kappa": kappa}, size=shape, transform=tr.circular + pm.VonMises, {"mu": mu, "kappa": kappa}, size=size, transform=tr.circular ) self.check_transform_elementwise_logp(model) @pytest.mark.parametrize( "a,size", [(np.ones(2), None), (np.ones((2, 3)) * 0.5, None), (np.ones(3), (4,))] ) - def test_dirichlet(self, a, shape): - model = self.build_model(pm.Dirichlet, {"a": a}, size=shape, transform=tr.stick_breaking) + def test_dirichlet(self, a, size): + model = self.build_model(pm.Dirichlet, {"a": a}, size=size, transform=tr.stick_breaking) self.check_vectortransform_elementwise_logp(model, vect_opt=1) def test_normal_ordered(self): @@ -413,7 +401,7 @@ def test_half_normal_ordered(self, sd, size): model = self.build_model( pm.HalfNormal, {"sd": sd}, - size=shape, + size=size, testval=testval, transform=tr.Chain([tr.log, tr.ordered]), ) @@ -425,7 +413,7 @@ def test_exponential_ordered(self, lam, size): model = self.build_model( pm.Exponential, {"lam": lam}, - size=shape, + size=size, testval=testval, transform=tr.Chain([tr.log, tr.ordered]), ) @@ -443,7 +431,7 @@ def test_beta_ordered(self, a, b, size): model = self.build_model( pm.Beta, {"alpha": a, "beta": b}, - size=shape, + size=size, testval=testval, transform=tr.Chain([tr.logodds, tr.ordered]), ) @@ -456,8 +444,8 @@ def test_beta_ordered(self, a, b, size): def test_uniform_ordered(self, lower, upper, size): def transform_params(rv_var): _, _, _, lower, upper = rv_var.owner.inputs - lower = at.as_tensor_variable(lower) if lower is not None else None - upper = at.as_tensor_variable(upper) if upper is not None else None + lower = aet.as_tensor_variable(lower) if lower is not None else None + upper = aet.as_tensor_variable(upper) if upper is not None else None return lower, upper interval = tr.Interval(transform_params) @@ -466,7 +454,7 @@ def transform_params(rv_var): model = self.build_model( pm.Uniform, {"lower": lower, "upper": upper}, - size=shape, + size=size, testval=testval, transform=tr.Chain([interval, tr.ordered]), ) @@ -479,7 +467,7 @@ def test_vonmises_ordered(self, mu, kappa, size): model = self.build_model( pm.VonMises, {"mu": mu, "kappa": kappa}, - size=shape, + size=size, testval=testval, transform=tr.Chain([tr.circular, tr.ordered]), ) @@ -498,7 +486,7 @@ def test_uniform_other(self, lower, upper, size, transform): model = self.build_model( pm.Uniform, {"lower": lower, "upper": upper}, - size=shape, + size=size, testval=testval, transform=transform, ) @@ -514,6 +502,6 @@ def test_uniform_other(self, lower, upper, size, transform): def test_mvnormal_ordered(self, mu, cov, size, shape): testval = np.sort(np.random.randn(*shape)) model = self.build_model( - pm.MvNormal, {"mu": mu, "cov": cov}, size=shape, testval=testval, transform=tr.ordered + pm.MvNormal, {"mu": mu, "cov": cov}, size=size, testval=testval, transform=tr.ordered ) self.check_vectortransform_elementwise_logp(model, vect_opt=1) From 319dc2e11f06597477e7e8de28cab9cc920455b9 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sat, 20 Mar 2021 13:10:57 -0500 Subject: [PATCH 17/44] Add non_sequences to uses of Scan Op This make `aesara.graph.basic.clone_replace` work correctly when `Scan`s are included in a graph. 
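As an illustration only (not part of this patch; the variable names below are
made up), passing a variable through `non_sequences` makes it an explicit
outer input of the `Scan` node, so a graph-wide `clone_replace` can reach it:

    import aesara
    import aesara.tensor as aet

    from aesara.graph.basic import clone_replace

    x = aet.vector("x")
    a = aet.scalar("a")

    # `a` is declared as an explicit non-sequence input of the `Scan` node
    ys, _ = aesara.scan(
        lambda x_i, a_i: a_i * x_i,
        sequences=[x],
        non_sequences=[a],
    )

    b = aet.scalar("b")

    # Because `a` is an explicit outer input, the substitution also reaches
    # the `Scan`'s inner graph
    (ys_b,) = clone_replace([ys], replace={a: b})

    f = aesara.function([x, b], ys_b)
    # e.g. f([1.0, 2.0], 3.0) -> [3.0, 6.0]

Without the explicit argument, a replacement made on the outer graph may not
be propagated into the `Scan`'s inner graph, which is the failure mode this
change avoids.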
--- pymc3/distributions/dist_math.py | 4 ++-- pymc3/step_methods/sgmcmc.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py index 3626e1b80f..6fe00849a5 100644 --- a/pymc3/distributions/dist_math.py +++ b/pymc3/distributions/dist_math.py @@ -546,8 +546,8 @@ def _step(i, t, s, a, b, value): (t, s), _ = scan( _step, - sequences=[at.arange(2, 302)], - outputs_info=[e for e in at.cast((t, s), "float64")], + sequences=[aet.arange(2, 302)], + outputs_info=[e for e in aet.cast((t, s), "float64")], non_sequences=[a, b, value], ) diff --git a/pymc3/step_methods/sgmcmc.py b/pymc3/step_methods/sgmcmc.py index a3e4262b4d..9b3f299b32 100644 --- a/pymc3/step_methods/sgmcmc.py +++ b/pymc3/step_methods/sgmcmc.py @@ -65,7 +65,7 @@ def elemwise_dlogL(vars, model, flat_view): for var in vars: output, _ = aesara.scan( lambda i, logX, v: aesara.grad(logX[i], v).flatten(), - sequences=[at.arange(logL.shape[0])], + sequences=[aet.arange(logL.shape[0])], non_sequences=[logL, var], ) terms.append(output) From 1fde7e35e8474cc2dee6323c0fb309ca2dbfdcbd Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sat, 20 Mar 2021 13:54:50 -0500 Subject: [PATCH 18/44] Replace Observed Op with tag.observations --- pymc3/distributions/__init__.py | 52 ++++-------- pymc3/model.py | 137 ++++++++++++------------------ pymc3/model_graph.py | 6 +- pymc3/sampling.py | 2 +- pymc3/tests/test_model.py | 15 ++-- pymc3/tests/test_model_helpers.py | 21 ++--- 6 files changed, 92 insertions(+), 141 deletions(-) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index 9f76c57d75..c98104cd2f 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -23,7 +23,7 @@ from aesara import config from aesara.graph.basic import Variable, ancestors, clone_replace from aesara.graph.op import Op, compute_test_value -from aesara.tensor.random.op import Observed, RandomVariable +from aesara.tensor.random.op import RandomVariable from aesara.tensor.subtensor import AdvancedSubtensor, AdvancedSubtensor1, Subtensor from aesara.tensor.var import TensorVariable @@ -141,22 +141,16 @@ def change_rv_size( return rv_var -def rv_log_likelihood_args( - rv_var: TensorVariable, - *, - return_observations: bool = True, +def extract_rv_and_value_vars( + var: TensorVariable, ) -> Tuple[TensorVariable, TensorVariable]: - """Get a `RandomVariable` and its corresponding log-likelihood `TensorVariable` value. + """Extract a random variable and its corresponding value variable from a generic + `TensorVariable`. Parameters ========== - rv_var - A variable corresponding to a `RandomVariable`, whether directly or - indirectly (e.g. an observed variable that's the output of an - `Observed` `Op`). - return_observations - When ``True``, return the observed values in place of the log-likelihood - value variable. + var + A variable corresponding to a `RandomVariable`. Returns ======= @@ -165,16 +159,14 @@ def rv_log_likelihood_args( variable). 
""" + if not var.owner: + return None, None - if rv_var.owner and isinstance(rv_var.owner.op, Observed): - rv_var, obs_var = rv_var.owner.inputs - if return_observations: - return rv_var, obs_var - else: - return rv_var, rv_log_likelihood_args(rv_var)[1] + if isinstance(var.owner.op, RandomVariable): + rv_value = getattr(var.tag, "value_var", None) + return var, rv_value - rv_value = getattr(rv_var.tag, "value_var", None) - return rv_var, rv_value + return None, None def rv_ancestors(graphs: List[TensorVariable]) -> Generator[TensorVariable, None, None]: @@ -186,14 +178,6 @@ def rv_ancestors(graphs: List[TensorVariable]) -> Generator[TensorVariable, None yield anc -def strip_observed(x: TensorVariable) -> TensorVariable: - """Return the `RandomVariable` term for an `Observed` node input; otherwise, return the input.""" - if x.owner and isinstance(x.owner.op, Observed): - return x.owner.inputs[0] - else: - return x - - def sample_to_measure_vars( graphs: List[TensorVariable], ) -> Tuple[List[TensorVariable], List[TensorVariable]]: @@ -223,7 +207,7 @@ def sample_to_measure_vars( if not (anc.owner and isinstance(anc.owner.op, RandomVariable)): continue - _, value_var = rv_log_likelihood_args(anc, return_observations=False) + _, value_var = extract_rv_and_value_vars(anc) if value_var is not None: replace[anc] = value_var @@ -270,7 +254,7 @@ def logpt( """ - rv_var, rv_value_var = rv_log_likelihood_args(rv_var) + rv_var, rv_value_var = extract_rv_and_value_vars(rv_var) if rv_value is None: rv_value = rv_value_var @@ -311,8 +295,8 @@ def logpt( return aet.zeros_like(rv_var) - # This case should be reached when `rv_var` is either the result of an - # `Observed` or a `RandomVariable` `Op` + # This case should be reached when `rv_var` is the output of a + # `RandomVariable` `Op` rng, size, dtype, *dist_params = rv_node.inputs dist_params, replacements = sample_to_measure_vars(dist_params) @@ -394,7 +378,7 @@ def logcdf( ): """Create a log-CDF graph.""" - rv_var, _ = rv_log_likelihood_args(rv_var) + rv_var, _ = extract_rv_and_value_vars(rv_var) rv_node = rv_var.owner if not rv_node: diff --git a/pymc3/model.py b/pymc3/model.py index 7746532813..e9c34124e2 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -30,7 +30,6 @@ from aesara.compile.sharedvalue import SharedVariable from aesara.gradient import grad from aesara.graph.basic import Constant, Variable, graph_inputs -from aesara.tensor.random.op import Observed, observed from aesara.tensor.var import TensorVariable from pandas import Series @@ -684,7 +683,9 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): for var in self.free_RVs + self.potentials ] ) - observed_RVs_logp = aet.sum([aet.sum(logpt(obs)) for obs in self.observed_RVs]) + observed_RVs_logp = aet.sum( + [aet.sum(logpt(obs, obs.tag.observations)) for obs in self.observed_RVs] + ) costs = [free_RVs_logp, observed_RVs_logp] else: @@ -699,7 +700,7 @@ def logpt(self): """Aesara scalar of log-probability of the model""" with self: factors = [logpt_sum(var, getattr(var.tag, "value_var", None)) for var in self.free_RVs] - factors += [logpt_sum(obs) for obs in self.observed_RVs] + factors += [logpt_sum(obs, obs.tag.observations) for obs in self.observed_RVs] factors += self.potentials logp_var = aet.sum([aet.sum(factor) for factor in factors]) if self.name: @@ -721,7 +722,9 @@ def logp_nojact(self): logpt_sum(var, getattr(var.tag, "value_var", None), jacobian=False) for var in self.free_RVs ] - factors += [logpt_sum(obs, jacobian=False) for obs in self.observed_RVs] + 
factors += [ + logpt_sum(obs, obs.tag.observations, jacobian=False) for obs in self.observed_RVs + ] factors += self.potentials logp_var = aet.sum([aet.sum(factor) for factor in factors]) if self.name: @@ -741,7 +744,7 @@ def varlogpt(self): @property def datalogpt(self): with self: - factors = [logpt(obs) for obs in self.observed_RVs] + factors = [logpt(obs, obs.tag.observations) for obs in self.observed_RVs] factors += [aet.sum(factor) for factor in self.potentials] return aet.sum(factors) @@ -855,56 +858,7 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans rv_var.tag.total_size = total_size if data is None: - # Create a `TensorVariable` that will be used as the random - # variable's "value" in log-likelihood graphs. - # - # In general, we'll call this type of variable the "value" variable. - # - # In all other cases, the role of the value variable is taken by - # observed data. That's why value variables are only referenced in - # this branch of the conditional. self.free_RVs.append(rv_var) - value_var = rv_var.clone() - - transform = transform or logp_transform(rv_var.owner.op) - - if transform is not None: - value_var.tag.transform = transform - value_var.name = f"{rv_var.name}_{transform.name}" - if aesara.config.compute_test_value != "off": - value_var.tag.test_value = transform.forward(rv_var, value_var).tag.test_value - - # The transformed variable needs to be a named variable in the - # model, too - self.named_vars[value_var.name] = value_var - else: - value_var.name = rv_var.name - - rv_var.tag.value_var = value_var - # XXX: This is a circular reference. - value_var.tag.rv_var = rv_var - - elif isinstance(data, dict): - - # TODO: How exactly does this dictionary map to `rv_var`? - - # obs_rvs = {name: make_obs_var(rv_var, d, name, self) for name, d in data.items()} - # rv_var.tag.data = obs_rvs - # - # missing_values = [ - # datum.missing_values for datum in data.values() if datum.missing_values is not None - # ] - # rv_var.tag.missing_values = missing_values - # - # self.observed_RVs.append(rv_var) - # - # if missing_values: - # self.free_RVs += rv_var.tag.missing_values - # self.missing_values += rv_var.tag.missing_values - # for v in rv_var.tag.missing_values: - # self.named_vars[v.name] = v - - raise NotImplementedError() else: if ( isinstance(data, Variable) @@ -915,8 +869,7 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans data = pandas_to_array(data) - rv_var = make_obs_var(rv_var, data, name, self) - rv_var.tag.data = data + rv_var = make_obs_var(rv_var, data) self.observed_RVs.append(rv_var) @@ -925,6 +878,37 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans self.missing_values.append(rv_var.tag.missing_values) self.named_vars[rv_var.tag.missing_values.name] = rv_var.tag.missing_values + # Create a `TensorVariable` that will be used as the random + # variable's "value" in log-likelihood graphs. + # + # In general, we'll call this type of variable the "value" variable. + # + # In all other cases, the role of the value variable is taken by + # observed data. That's why value variables are only referenced in + # this branch of the conditional. 
+ value_var = rv_var.type() + + if aesara.config.compute_test_value != "off": + value_var.tag.test_value = rv_var.tag.test_value + + value_var.name = f"{rv_var.name}_value" + + rv_var.tag.value_var = value_var + + # Make the value variable a transformed value variable, + # if there's an applicable transform + transform = transform or logp_transform(rv_var.owner.op) + + if transform is not None: + value_var.tag.transform = transform + value_var.name = f"{value_var.name}_{transform.name}__" + if aesara.config.compute_test_value != "off": + value_var.tag.test_value = transform.forward(rv_var, value_var).tag.test_value + + # The transformed variable needs to be a named variable in the + # model, too + self.named_vars[value_var.name] = value_var + self.add_random_variable(rv_var, dims) return rv_var @@ -1447,9 +1431,7 @@ def pandas_to_array(data): mask = getattr(data, "mask", None) if mask is not None: -def make_obs_var( - rv_var: TensorVariable, data: Union[np.ndarray], name: str, model: Model -) -> TensorVariable: +def make_obs_var(rv_var: TensorVariable, data: Union[np.ndarray]) -> TensorVariable: """Create a `TensorVariable` for an observed random variable. Parameters @@ -1458,16 +1440,13 @@ def make_obs_var( The random variable that is observed. data: ndarray The observed data. - name: str - The name of the random variable. - model: Model - The model object. Returns ======= The new observed random variable """ + name = rv_var.name data = pandas_to_array(data).astype(rv_var.dtype) # The shapes of the observed random variable and its data might not @@ -1488,18 +1467,12 @@ def make_obs_var( rv_var = change_rv_size(rv_var, new_size) - if aesara.config.compute_test_value != "off" and test_value is not None: - # We try to reuse the old test value - rv_var.tag.test_value = np.broadcast_to(test_value, rv_var.tag.test_value.shape) - - # An independent variable used as the generic log-likelihood input - # parameter (i.e. the measure-space counterpart to the sample-space - # variable `rv_var`). - value_var = rv_var.clone() - rv_var.tag.value_var = value_var - # XXX: This is a circular reference. 
- value_var.tag.rv_var = rv_var - value_var.name = f"{rv_var.name}" + if aesara.config.compute_test_value != "off": + if test_value is not None: + # We try to reuse the old test value + rv_var.tag.test_value = np.broadcast_to(test_value, rv_var.tag.test_value.shape) + else: + rv_var.tag.test_value = data missing_values = None mask = getattr(data, "mask", None) @@ -1514,21 +1487,15 @@ def make_obs_var( missing_values = rv_var[mask] constant = aet.as_tensor_variable(data.filled()) data = aet.set_subtensor(constant[mask.nonzero()], missing_values) - - # Now, we need log-likelihood-space terms for these missing values - value_var.name = f"{rv_var.name}_missing" - elif sps.issparse(data): data = sparse.basic.as_sparse(data, name=name) else: data = aet.as_tensor_variable(data, name=name) - rv_obs = observed(rv_var, data) - rv_obs.tag.missing_values = missing_values - - rv_obs.name = name + rv_var.tag.missing_values = missing_values + rv_var.tag.observations = data - return rv_obs + return rv_var rv_var.tag.observations = data @@ -1619,7 +1586,7 @@ def as_iterargs(data): def all_continuous(vars): """Check that vars not include discrete variables or BART variables, excepting observed RVs.""" - vars_ = [var for var in vars if not (var.owner and isinstance(var.owner.op, Observed))] + vars_ = [var for var in vars if not (var.owner and hasattr(var.tag, "observations"))] if any( [ (var.dtype in pm.discrete_types or (var.owner and isinstance(var.owner.op, pm.BART))) diff --git a/pymc3/model_graph.py b/pymc3/model_graph.py index 47f6625b17..e35eaf1123 100644 --- a/pymc3/model_graph.py +++ b/pymc3/model_graph.py @@ -17,7 +17,7 @@ from aesara.compile.sharedvalue import SharedVariable from aesara.graph.basic import walk -from aesara.tensor.random.op import Observed +from aesara.tensor.random.op import RandomVariable from aesara.tensor.var import TensorVariable import pymc3 as pm @@ -112,7 +112,7 @@ def update_input_map(key: str, val: Set[VarName]): for var_name in self.var_names: var = self.model[var_name] update_input_map(var_name, self.get_parents(var)) - if var.owner and isinstance(var.owner.op, Observed): + if hasattr(var.tag, "observations"): try: obs_name = var.tag.observations.name if obs_name: @@ -128,7 +128,7 @@ def _make_node(self, var_name, graph, *, formatting: str = "plain"): # styling for node attrs = {} - if v.owner and isinstance(v.owner.op, Observed): + if v.owner and isinstance(v.owner.op, RandomVariable) and hasattr(v.tag, "observations"): attrs["style"] = "filled" # make Data be roundtangle, instead of rectangle diff --git a/pymc3/sampling.py b/pymc3/sampling.py index ff1b03f694..232bbf42cb 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -41,7 +41,7 @@ from pymc3.backends.base import BaseTrace, MultiTrace from pymc3.backends.ndarray import NDArray from pymc3.blocking import DictToArrayBijection -from pymc3.distributions import change_rv_size, rv_ancestors, strip_observed +from pymc3.distributions import change_rv_size, rv_ancestors from pymc3.exceptions import IncorrectArgumentsError, SamplingError from pymc3.model import Model, Point, modelcontext from pymc3.parallel_sampling import Draw, _cpu_count diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index 85b515a3b6..db239d5ff7 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -29,6 +29,8 @@ from aesara.tensor.var import TensorConstant from numpy.testing import assert_almost_equal +from aesara.tensor.subtensor import AdvancedIncSubtensor + import pymc3 as pm from pymc3 import 
Deterministic, Potential @@ -201,17 +203,20 @@ def test_duplicate_vars(): def test_empty_observed(): data = pd.DataFrame(np.ones((2, 3)) / 3) data.values[:] = np.nan - with pm.Model(): + with pm.Model(aesara_config={"compute_test_value": "raise"}): a = pm.Normal("a", observed=data) + + assert isinstance(a.tag.observations.owner.op, AdvancedIncSubtensor) # The masked observations are replaced by elements of the RV `a`, # which means that they should all have the same sample test values - a_data = a.owner.inputs[1] - npt.assert_allclose(a.tag.test_value, a_data.tag.test_value) + a_data = a.tag.observations.owner.inputs[1] + npt.assert_allclose(a.tag.test_value.flatten(), a_data.tag.test_value) # Let's try this again with another distribution b = pm.Gamma("b", alpha=1, beta=1, observed=data) - b_data = b.owner.inputs[1] - npt.assert_allclose(b.tag.test_value, b_data.tag.test_value) + assert isinstance(b.tag.observations.owner.op, AdvancedIncSubtensor) + b_data = b.tag.observations.owner.inputs[1] + npt.assert_allclose(b.tag.test_value.flatten(), b_data.tag.test_value) class TestValueGradFunction(unittest.TestCase): diff --git a/pymc3/tests/test_model_helpers.py b/pymc3/tests/test_model_helpers.py index 93fdb97259..4acaee7dd3 100644 --- a/pymc3/tests/test_model_helpers.py +++ b/pymc3/tests/test_model_helpers.py @@ -108,7 +108,6 @@ def test_pandas_to_array(self, input_dtype): # Make sure the returned object is a Aesara TensorVariable assert isinstance(wrapped, TensorVariable) - @pytest.mark.xfail(reason="`Observed` `Op` doesn't take `SparseConstant`s, yet") def test_make_obs_var(self): """ Check returned values for `data` given known inputs to `as_tensor()`. @@ -127,20 +126,16 @@ def test_make_obs_var(self): with fake_model: fake_distribution = pm.Normal.dist(mu=0, sigma=1) # Create the testval attribute simply for the sake of model testing - fake_distribution.testval = None + fake_distribution.name = input_name # Check function behavior using the various inputs - dense_output = pm.model.make_obs_var(fake_distribution, dense_input, input_name, fake_model) - sparse_output = pm.model.make_obs_var( - fake_distribution, sparse_input, input_name, fake_model - ) - masked_output = pm.model.make_obs_var( - fake_distribution, masked_array_input, input_name, fake_model - ) + dense_output = pm.model.make_obs_var(fake_distribution, dense_input) + sparse_output = pm.model.make_obs_var(fake_distribution, sparse_input) + masked_output = pm.model.make_obs_var(fake_distribution, masked_array_input) # Ensure that the missing values are appropriately set to None for func_output in [dense_output, sparse_output]: - assert func_output.missing_values is None + assert func_output.tag.missing_values is None # Ensure that the Aesara variable names are correctly set. # Note that the output for masked inputs do not have their names set @@ -149,11 +144,11 @@ def test_make_obs_var(self): assert func_output.name == input_name # Ensure the that returned functions are all of the correct type - assert isinstance(dense_output, TensorConstant) - assert sparse.basic._is_sparse_variable(sparse_output) + assert isinstance(dense_output.tag.observations, TensorConstant) + assert sparse.basic._is_sparse_variable(sparse_output.tag.observations) # Masked output is something weird. 
Just ensure it has missing values # self.assertIsInstance(masked_output, TensorConstant) - assert masked_output.missing_values is not None + assert masked_output.tag.missing_values is not None return None From 158864584f09b5925f66cb7504a34fcf50a74e15 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sat, 20 Mar 2021 13:59:42 -0500 Subject: [PATCH 19/44] Comment out unused moments --- pymc3/distributions/continuous.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 12fa7616c4..55ac15d625 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -1155,8 +1155,8 @@ def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwar alpha = aet.as_tensor_variable(floatX(alpha)) beta = aet.as_tensor_variable(floatX(beta)) - mean = alpha / (alpha + beta) - variance = (alpha * beta) / ((alpha + beta) ** 2 * (alpha + beta + 1)) + # mean = alpha / (alpha + beta) + # variance = (alpha * beta) / ((alpha + beta) ** 2 * (alpha + beta + 1)) assert_negative_support(alpha, "alpha", "Beta") assert_negative_support(beta, "beta", "Beta") From 801c61a67b07816b07435d87ab3727b35a6de53b Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sat, 20 Mar 2021 20:18:06 -0500 Subject: [PATCH 20/44] Make logpt work correctly for nested models and transforms --- pymc3/distributions/__init__.py | 287 +++++++++++++++------------- pymc3/distributions/distribution.py | 19 +- pymc3/distributions/multivariate.py | 11 +- pymc3/distributions/transforms.py | 17 +- pymc3/model.py | 27 +-- pymc3/sampling.py | 11 +- pymc3/tests/test_distributions.py | 98 +++++----- 7 files changed, 250 insertions(+), 220 deletions(-) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index c98104cd2f..8961badcd0 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -11,17 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import warnings from functools import singledispatch -from itertools import chain -from typing import Generator, List, Optional, Tuple, Union +from typing import Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union import aesara.tensor as aet import numpy as np from aesara import config -from aesara.graph.basic import Variable, ancestors, clone_replace +from aesara.graph.basic import Variable, clone_replace, graph_inputs, io_toposort, walk from aesara.graph.op import Op, compute_test_value from aesara.tensor.random.op import RandomVariable from aesara.tensor.subtensor import AdvancedSubtensor, AdvancedSubtensor1, Subtensor @@ -163,61 +161,105 @@ def extract_rv_and_value_vars( return None, None if isinstance(var.owner.op, RandomVariable): - rv_value = getattr(var.tag, "value_var", None) + rv_value = getattr(var.tag, "observations", getattr(var.tag, "value_var", None)) return var, rv_value return None, None -def rv_ancestors(graphs: List[TensorVariable]) -> Generator[TensorVariable, None, None]: - """Yield the ancestors that are `RandomVariable` outputs for the given `graphs`.""" - for anc in ancestors(graphs): - if anc in graphs: - continue - if anc.owner and isinstance(anc.owner.op, RandomVariable): - yield anc +def rv_ancestors( + graphs: Iterable[TensorVariable], walk_past_rvs: bool = False +) -> Generator[TensorVariable, None, None]: + """Yield everything except the inputs of ``RandomVariable``s. + Parameters + ========== + graphs + The graphs to walk. + walk_past_rvs + If ``True``, do descend into ``RandomVariable``s. + """ -def sample_to_measure_vars( - graphs: List[TensorVariable], -) -> Tuple[List[TensorVariable], List[TensorVariable]]: - """Replace sample-space variables in graphs with their measure-space counterparts. + def expand(var): + if var.owner and (walk_past_rvs or not isinstance(var.owner.op, RandomVariable)): + return reversed(var.owner.inputs) - Sample-space variables are `TensorVariable` outputs of `RandomVariable` - `Op`s. Measure-space variables are `TensorVariable`s that correspond to - the value of a sample-space variable in a likelihood function (e.g. ``x`` - in ``p(X = x)``, where ``X`` is the corresponding sample-space variable). - (``x`` is also the variable found in ``rv_var.tag.value_var``, so this - function could also be called ``sample_to_value_vars``.) + yield from walk(graphs, expand, False) + + +def replace_rvs_in_graphs( + graphs: Iterable[TensorVariable], + replacement_fn: Callable[[TensorVariable], Dict[TensorVariable, TensorVariable]], + initial_replacements: Optional[Dict[TensorVariable, TensorVariable]] = None, +) -> Tuple[TensorVariable, Dict[TensorVariable, TensorVariable]]: + """Replace random variables in graphs + + This will *not* recompute test values. Parameters ========== graphs - The graphs in which random variables are to be replaced by their - measure variables. + The graphs in which random variables are to be replaced. Returns ======= Tuple containing the transformed graphs and a ``dict`` of the replacements that were made. 
""" - replace = {} - for anc in chain(rv_ancestors(graphs), graphs): + replacements = {} + if initial_replacements: + replacements.update(initial_replacements) - if not (anc.owner and isinstance(anc.owner.op, RandomVariable)): - continue + for var in rv_ancestors(graphs): + if var.owner and isinstance(var.owner.op, RandomVariable): + replacement_fn(var, replacements) - _, value_var = extract_rv_and_value_vars(anc) + if replacements: + graphs = clone_replace(graphs, replacements) - if value_var is not None: - replace[anc] = value_var + return graphs, replacements + + +def rvs_to_value_vars( + graphs: Iterable[TensorVariable], initial_replacements: Dict[TensorVariable, TensorVariable] +) -> Tuple[Iterable[TensorVariable], Dict[TensorVariable, TensorVariable]]: + """Replace random variables in graphs with their value variables. + + This will *not* recompute test values. + """ + + def value_var_replacements(var, replacements): + rv_var, rv_value_var = extract_rv_and_value_vars(var) + + if rv_value_var is not None: + replacements[var] = rv_value_var + + return replace_rvs_in_graphs(graphs, value_var_replacements, initial_replacements) - if replace: - measure_graphs = clone_replace(graphs, replace=replace) - else: - measure_graphs = graphs - return measure_graphs, replace +def apply_transforms( + graphs: Iterable[TensorVariable], +) -> Tuple[TensorVariable, Dict[TensorVariable, TensorVariable]]: + """Apply the transforms associated with each random variable in `graphs`. + + This will *not* recompute test values. + """ + + def transform_replacements(var, replacements): + rv_var, rv_value_var = extract_rv_and_value_vars(var) + + if rv_value_var is None: + return + + transform = getattr(rv_value_var.tag, "transform", None) + + if transform is None: + return + + trans_rv_value = transform.backward(rv_var, rv_value_var) + replacements[var] = trans_rv_value + + return replace_rvs_in_graphs(graphs, transform_replacements) def logpt( @@ -227,6 +269,8 @@ def logpt( jacobian: bool = True, scaling: bool = True, transformed: bool = True, + cdf: bool = False, + sum: bool = False, **kwargs, ) -> TensorVariable: """Create a measure-space (i.e. log-likelihood) graph for a random variable at a given point. @@ -241,78 +285,65 @@ def logpt( rv_var The `RandomVariable` output that determines the log-likelihood graph. rv_value - The input variable for the log-likelihood graph. If `rv_value` is - a transformed variable, its transformations will be applied. - If no value is provided, `rv_var.tag.value_var` will be checked and, - when available, used. + The variable that represents the value of `rv_var` in its + log-likelihood. If no value is provided, `rv_var.tag.value_var` will + be checked and, when available, used. jacobian Whether or not to include the Jacobian term. scaling A scaling term to apply to the generated log-likelihood graph. transformed Apply transforms. + cdf + Return the log cumulative distribution. + sum + Sum the log-likelihood. 
""" rv_var, rv_value_var = extract_rv_and_value_vars(rv_var) if rv_value is None: + + if rv_value_var is None: + raise ValueError(f"No value variable specified or associated with {rv_var}") + rv_value = rv_value_var else: rv_value = aet.as_tensor(rv_value) - if rv_value_var is None: - rv_value_var = rv_value + # Make sure that the value is compatible with the random variable + rv_value = rv_var.type.filter_variable(rv_value.astype(rv_var.dtype)) + + if rv_value_var is None: + rv_value_var = rv_value rv_node = rv_var.owner if not rv_node: - raise TypeError("rv_var must be the output of a RandomVariable Op") + return aet.zeros_like(rv_var) if not isinstance(rv_node.op, RandomVariable): + return _logp(rv_node.op, rv_value, rv_node.inputs) - # This will probably need another generic function... - if isinstance(rv_node.op, (Subtensor, AdvancedSubtensor, AdvancedSubtensor1)): - - raise NotImplementedError("Missing value support is incomplete") - - # # "Flatten" and sum an array of indexed RVs' log-likelihoods - # rv_var, missing_values = rv_node.inputs - # - # missing_values = missing_values.data - # logp_var = aet.sum( - # [ - # logpt( - # rv_var, - # ) - # for idx, missing in zip( - # np.ndindex(missing_values.shape), missing_values.flatten() - # ) - # if missing - # ] - # ) - # return logp_var - - return aet.zeros_like(rv_var) - - # This case should be reached when `rv_var` is the output of a - # `RandomVariable` `Op` rng, size, dtype, *dist_params = rv_node.inputs - dist_params, replacements = sample_to_measure_vars(dist_params) - - transform = getattr(rv_value_var.tag, "transform", None) + # Here, we plug the actual random variable into the log-likelihood graph, + # because we want a log-likelihood graph that only contains + # random variables. This is important, because a random variable's + # parameters can contain random variables themselves. + # Ultimately, with a graph containing only random variables and + # "deterministics", we can simply replace all the random variables with + # their value variables and be done. 
+ if not cdf: + logp_var = _logp(rv_node.op, rv_var, *dist_params, **kwargs) + else: + logp_var = _logcdf(rv_node.op, rv_var, *dist_params, **kwargs) - # If any of the measure vars are transformed measure-space variables - # (signified by having a `transform` value in their tags), then we apply - # the their transforms and add their Jacobians (when enabled) - if transform and transformed: - logp_var = _logp(rv_node.op, transform.backward(rv_var, rv_value), *dist_params, **kwargs) + transform = getattr(rv_value_var.tag, "transform", None) if rv_value_var else None - logp_var = transform_logp( - logp_var, - tuple(replacements.values()), - ) + if transform and transformed and not cdf: + (logp_var,), _ = apply_transforms((logp_var,)) if jacobian: transformed_jacobian = transform.jacobian_det(rv_var, rv_value) @@ -320,47 +351,33 @@ def logpt( if logp_var.ndim > transformed_jacobian.ndim: logp_var = logp_var.sum(axis=-1) logp_var += transformed_jacobian - else: - logp_var = _logp(rv_node.op, rv_value_var, *dist_params, **kwargs) - (logp_var,) = clone_replace([logp_var], replace={rv_value_var: rv_value}) + # Replace random variables with their value variables + (logp_var,), replaced = rvs_to_value_vars((logp_var,), {rv_var: rv_value}) + + if rv_value_var != rv_value: + (logp_var,) = clone_replace((logp_var,), replace={rv_value_var: rv_value}) + + if sum: + logp_var = aet.sum(logp_var) if scaling: logp_var *= _get_scaling( - getattr(rv_var.tag, "total_size", None), rv_value_var.shape, rv_value_var.ndim + getattr(rv_var.tag, "total_size", None), rv_value.shape, rv_value.ndim ) + # Recompute test values for the changes introduced by the replacements + # above. + if config.compute_test_value != "off": + for node in io_toposort(graph_inputs((logp_var,)), (logp_var,)): + compute_test_value(node) + if rv_var.name is not None: logp_var.name = "__logp_%s" % rv_var.name return logp_var -def transform_logp(logp_var: TensorVariable, inputs: List[TensorVariable]) -> TensorVariable: - """Transform the inputs of a log-likelihood graph.""" - trans_replacements = {} - for measure_var in inputs: - - transform = getattr(measure_var.tag, "transform", None) - rv_var = getattr(measure_var.tag, "rv_var", None) - - if transform is not None and rv_var is None: - warnings.warn( - f"A transform was found for {measure_var} but not a corresponding random variable" - ) - - if transform is None or rv_var is None: - continue - - trans_rv_value = transform.backward(rv_var, measure_var) - trans_replacements[measure_var] = trans_rv_value - - if trans_replacements: - (logp_var,) = clone_replace([logp_var], trans_replacements) - - return logp_var - - @singledispatch def _logp(op: Op, value: TensorVariable, *dist_params, **kwargs): """Create a log-likelihood graph. 
@@ -373,26 +390,35 @@ def _logp(op: Op, value: TensorVariable, *dist_params, **kwargs): return aet.zeros_like(value) -def logcdf( - rv_var: TensorVariable, rv_value: Optional[TensorVariable], jacobian: bool = True, **kwargs -): - """Create a log-CDF graph.""" - - rv_var, _ = extract_rv_and_value_vars(rv_var) - rv_node = rv_var.owner - - if not rv_node: - raise TypeError() +@_logp.register(Subtensor) +@_logp.register(AdvancedSubtensor) +@_logp.register(AdvancedSubtensor1) +def subtensor_logp(op, value, *inputs, **kwargs): - rv_value = aet.as_tensor(rv_value) - - rng, size, dtype, *dist_params = rv_node.inputs - - dist_params, replacements = sample_to_measure_vars(dist_params) - - logp_var = _logcdf(rv_node.op, rv_value, *dist_params, **kwargs) + # TODO: Compute the log-likelihood for a subtensor/index operation. + raise NotImplementedError() - return logp_var + # "Flatten" and sum an array of indexed RVs' log-likelihoods + # rv_var, missing_values = + # + # missing_values = missing_values.data + # logp_var = aet.sum( + # [ + # logpt( + # rv_var, + # ) + # for idx, missing in zip( + # np.ndindex(missing_values.shape), missing_values.flatten() + # ) + # if missing + # ] + # ) + # return logp_var + + +def logcdf(*args, **kwargs): + """Create a log-CDF graph.""" + return logpt(*args, cdf=True, **kwargs) @singledispatch @@ -407,16 +433,15 @@ def _logcdf(op, value, *args, **kwargs): raise NotImplementedError() -def logpt_sum(rv_var: TensorVariable, rv_value: Optional[TensorVariable] = None, **kwargs): +def logpt_sum(*args, **kwargs): """Return the sum of the logp values for the given observations. Subclasses can use this to improve the speed of logp evaluations if only the sum of the logp values is needed. """ - return aet.sum(logpt(rv_var, rv_value, **kwargs)) + return logpt(*args, sum=True, **kwargs) -from pymc3.distributions import shape_utils, timeseries, transforms from pymc3.distributions.bart import BART from pymc3.distributions.bound import Bound from pymc3.distributions.continuous import ( diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 4706a92eec..f1192374ed 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -26,7 +26,7 @@ from aesara.tensor.random.op import RandomVariable -from pymc3.distributions import _logcdf, _logp, logp_transform +from pymc3.distributions import _logcdf, _logp if TYPE_CHECKING: from typing import Optional, Callable @@ -111,12 +111,12 @@ def logp(op, value, *dist_params, **kwargs): def logcdf(op, value, *dist_params, **kwargs): return class_logcdf(value, *dist_params, **kwargs) - class_transform = clsdict.get("transform") - if class_transform: - - @logp_transform.register(rv_type) - def transform(op, *args, **kwargs): - return class_transform(*args, **kwargs) + # class_transform = clsdict.get("transform") + # if class_transform: + # + # @logp_transform.register(rv_type) + # def transform(op, *args, **kwargs): + # return class_transform(*args, **kwargs) # Register the Aesara `RandomVariable` type as a subclass of this # `Distribution` type. 
@@ -327,7 +327,10 @@ class Discrete(Distribution): def __new__(cls, name, *args, **kwargs): - super().__init__(shape, dtype, defaults=defaults, *args, **kwargs) + if kwargs.get("transform", None): + raise ValueError("Transformations for discrete distributions") + + return super().__new__(cls, name, *args, **kwargs) class Continuous(Distribution): diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index b7f3386bb3..803a8faf06 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -388,7 +388,7 @@ class Dirichlet(Continuous): rv_op = dirichlet @classmethod - def dist(cls, a, **kwargs): + def dist(cls, a, transform=transforms.stick_breaking, **kwargs): a = aet.as_tensor_variable(a) # mean = a / aet.sum(a) @@ -419,15 +419,6 @@ def logp(value, a): broadcast_conditions=False, ) - def transform(rv_var): - - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: - # If this variable is just a bunch of scalars/degenerate - # Dirichlets, we can't transform it - return None - - return transforms.stick_breaking - def _distr_parameters_for_repr(self): return ["a"] diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index dc54a7a444..5be28a5cde 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -215,7 +215,7 @@ def jacobian_det(self, rv_var, rv_value): s = aet.nnet.softplus(-rv_value) return aet.log(b - a) - 2 * s - rv_value else: - return rv_value + return aet.ones_like(rv_value) interval = Interval @@ -283,6 +283,11 @@ class StickBreaking(Transform): name = "stickbreaking" def forward(self, rv_var, rv_value): + if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + # If this variable is just a bunch of scalars/degenerate + # Dirichlets, we can't transform it + return rv_value + x = rv_value.T n = x.shape[0] lx = at.log(x) @@ -291,6 +296,11 @@ def forward(self, rv_var, rv_value): return floatX(y.T) def backward(self, rv_var, rv_value): + if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + # If this variable is just a bunch of scalars/degenerate + # Dirichlets, we can't transform it + return rv_value + y = rv_value.T y = aet.concatenate([y, -aet.sum(y, 0, keepdims=True)]) # "softmax" with vector support and no deprication warning: @@ -299,6 +309,11 @@ def backward(self, rv_var, rv_value): return floatX(x.T) def jacobian_det(self, rv_var, rv_value): + if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + # If this variable is just a bunch of scalars/degenerate + # Dirichlets, we can't transform it + return aet.ones_like(rv_value) + y = rv_value.T Km1 = y.shape[0] + 1 sy = at.sum(y, 0, keepdims=True) diff --git a/pymc3/model.py b/pymc3/model.py index e9c34124e2..d318db0dcb 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -112,6 +112,8 @@ def incorporate_methods(source, destination, methods, wrapper=None, override=Fal T = TypeVar("T", bound="ContextMeta") +no_transform_object = object() + class ContextMeta(type): """Functionality for objects that put themselves in a context using @@ -834,7 +836,9 @@ def add_coords(self, coords): else: self.coords[name] = coords[name] - def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, transform=None): + def register_rv( + self, rv_var, name, data=None, total_size=None, dims=None, transform=no_transform_object + ): """Register an (un)observed random variable with the model. 
Parameters @@ -891,13 +895,14 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans if aesara.config.compute_test_value != "off": value_var.tag.test_value = rv_var.tag.test_value - value_var.name = f"{rv_var.name}_value" + value_var.name = rv_var.name rv_var.tag.value_var = value_var # Make the value variable a transformed value variable, # if there's an applicable transform - transform = transform or logp_transform(rv_var.owner.op) + if transform is no_transform_object: + transform = logp_transform(rv_var.owner.op) if transform is not None: value_var.tag.transform = transform @@ -905,10 +910,6 @@ def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, trans if aesara.config.compute_test_value != "off": value_var.tag.test_value = transform.forward(rv_var, value_var).tag.test_value - # The transformed variable needs to be a named variable in the - # model, too - self.named_vars[value_var.name] = value_var - self.add_random_variable(rv_var, dims) return rv_var @@ -965,7 +966,7 @@ def __getitem__(self, key): except KeyError: raise e - def makefn(self, outs, mode=None, transformed=True, *args, **kwargs): + def makefn(self, outs, mode=None, *args, **kwargs): """Compiles a Aesara function which returns ``outs`` and takes the variable ancestors of ``outs`` as inputs. @@ -979,11 +980,8 @@ def makefn(self, outs, mode=None, transformed=True, *args, **kwargs): Compiled Aesara function """ with self: - vars = [ - v if not transformed else getattr(v.tag, "transformed_var", v) for v in self.vars - ] return aesara.function( - vars, + self.vars, outs, allow_input_downcast=True, on_unused_input="ignore", @@ -1197,7 +1195,10 @@ def point_logps(self, point=None, round_vals=2): return Series( { - rv.name: np.round(self.fn(logpt_sum(rv))(test_point), round_vals) + rv.name: np.round( + self.fn(logpt_sum(rv, getattr(rv.tag, "observations", None)))(test_point), + round_vals, + ) for rv in self.basic_RVs }, name="Log-probability of test_point", diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 232bbf42cb..d7bb2fbca5 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -1697,16 +1697,9 @@ def sample_posterior_predictive( if not hasattr(_trace, "varnames"): inputs_and_names = [ - (rv, rv.name) - for rv in walk_model(vars_to_sample, walk_past_rvs=True) - if rv not in vars_to_sample - and rv in model.named_vars.values() - and not isinstance(rv, SharedVariable) + (rv, rv.name) for rv in rv_ancestors(vars_to_sample, walk_past_rvs=True) ] - if inputs_and_names: - inputs, input_names = zip(*inputs_and_names) - else: - inputs, input_names = [], [] + inputs, input_names = zip(*inputs_and_names) else: output_names = [v.name for v in vars_to_sample if v.name is not None] input_names = [ diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index f78615a7ac..acd7b1e2bb 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -98,6 +98,7 @@ ZeroInflatedBinomial, ZeroInflatedNegativeBinomial, ZeroInflatedPoisson, + change_rv_size, continuous, logcdf, logpt, @@ -765,7 +766,7 @@ def check_logcdf( with Model() as m: dist = pymc3_dist("y", **params) params["value"] = value # for displaying in err_msg - with aesara.config.change_flags(mode=Mode("py")): + with aesara.config.change_flags(on_opt_error="raise", mode=Mode("py")): assert_almost_equal( logcdf(dist, value).eval(), scipy_cdf, @@ -830,14 +831,8 @@ def check_logcdf( ) # Test that method works with multiple values or raises informative TypeError - try: - 
with aesara.config.change_flags(mode=Mode("py")): - logcdf(valid_dist, np.array([valid_value, valid_value])).eval() - except TypeError as err: - if not str(err).endswith( - ".logcdf expects a scalar value but received a 1-dimensional object." - ): - raise + with pytest.raises(TypeError), aesara.config.change_flags(mode=Mode("py")): + logcdf(valid_dist, np.array([valid_value, valid_value])).eval() def check_selfconsistency_discrete_logcdf( self, distribution, domain, paramdomains, decimal=None, n_samples=100 @@ -854,10 +849,13 @@ def check_selfconsistency_discrete_logcdf( value = params.pop("value") values = np.arange(domain.lower, value + 1) dist = distribution.dist(**params) + # This only works for scalar random variables + assert dist.owner.op.ndim_supp == 0 + values_dist = change_rv_size(dist, values.shape) with aesara.config.change_flags(mode=Mode("py")): assert_almost_equal( logcdf(dist, value).eval(), - logsumexp(logpt(dist, values), keepdims=False).eval(), + logsumexp(logpt(values_dist, values), keepdims=False).eval(), decimal=decimal, err_msg=str(pt), ) @@ -899,8 +897,8 @@ def test_uniform(self): invalid_dist = Uniform.dist(lower=1, upper=0) with aesara.config.change_flags(mode=Mode("py")): - assert logpt(invalid_dist, 0.5).eval() == -np.inf - assert logcdf(invalid_dist, 2).eval() == -np.inf + assert logpt(invalid_dist, np.array(0.5)).eval() == -np.inf + assert logcdf(invalid_dist, np.array(2.0)).eval() == -np.inf @pytest.mark.xfail(reason="Distribution not refactored yet") def test_triangular(self): @@ -1572,13 +1570,22 @@ def test_beta_binomial_selfconsistency(self): {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, ) + @pytest.mark.xfail(reason="Bernoulli logit_p not refactored yet") + def test_bernoulli_logit_p(self): + self.check_logp( + Bernoulli, + Bool, + {"logit_p": R}, + lambda value, logit_p: sp.bernoulli.logpmf(value, scipy.special.expit(logit_p)), + ) + self.check_logcdf( + Bernoulli, + Bool, + {"logit_p": R}, + lambda value, logit_p: sp.bernoulli.logcdf(value, scipy.special.expit(logit_p)), + ) + def test_bernoulli(self): - # self.check_logp( - # Bernoulli, - # Bool, - # {"logit_p": R}, - # lambda value, logit_p: sp.bernoulli.logpmf(value, scipy.special.expit(logit_p)), - # ) self.check_logp( Bernoulli, Bool, @@ -1591,12 +1598,6 @@ def test_bernoulli(self): {"p": Unit}, lambda value, p: sp.bernoulli.logcdf(value, p), ) - # self.check_logcdf( - # Bernoulli, - # Bool, - # {"logit_p": R}, - # lambda value, logit_p: sp.bernoulli.logcdf(value, scipy.special.expit(logit_p)), - # ) self.check_selfconsistency_discrete_logcdf( Bernoulli, Bool, @@ -2009,23 +2010,24 @@ def test_multinomial(self, n): Multinomial, Vector(Nat, n), {"p": Simplex(n), "n": Nat}, multinomial_logpdf ) - # @pytest.mark.parametrize( - # "p,n", - # [ - # [[0.25, 0.25, 0.25, 0.25], 1], - # [[0.3, 0.6, 0.05, 0.05], 2], - # [[0.3, 0.6, 0.05, 0.05], 10], - # ], - # ) - # def test_multinomial_mode(self, p, n): - # _p = np.array(p) - # with Model() as model: - # m = Multinomial("m", n, _p, _p.shape) - # assert_allclose(m.distribution.mode.eval().sum(), n) - # _p = np.array([p, p]) - # with Model() as model: - # m = Multinomial("m", n, _p, _p.shape) - # assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) + @pytest.mark.skip(reason="Moment calculations have not been refactored yet") + @pytest.mark.parametrize( + "p,n", + [ + [[0.25, 0.25, 0.25, 0.25], 1], + [[0.3, 0.6, 0.05, 0.05], 2], + [[0.3, 0.6, 0.05, 0.05], 10], + ], + ) + def test_multinomial_mode(self, p, n): + _p = np.array(p) + with Model() as 
model: + m = Multinomial("m", n, _p, _p.shape) + assert_allclose(m.distribution.mode.eval().sum(), n) + _p = np.array([p, p]) + with Model() as model: + m = Multinomial("m", n, _p, _p.shape) + assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) @pytest.mark.parametrize( "p, size, n", @@ -2054,12 +2056,13 @@ def test_multinomial_random(self, p, size, n): assert m.eval().shape == size + p.shape - # def test_multinomial_mode_with_shape(self): - # n = [1, 10] - # p = np.asarray([[0.25, 0.25, 0.25, 0.25], [0.26, 0.26, 0.26, 0.22]]) - # with Model() as model: - # m = Multinomial("m", n=n, p=p, size=(2, 4)) - # assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) + @pytest.mark.skip(reason="Moment calculations have not been refactored yet") + def test_multinomial_mode_with_shape(self): + n = [1, 10] + p = np.asarray([[0.25, 0.25, 0.25, 0.25], [0.26, 0.26, 0.26, 0.22]]) + with Model() as model: + m = Multinomial("m", n=n, p=p, size=(2, 4)) + assert_allclose(m.distribution.mode.eval().sum(axis=-1), n) def test_multinomial_vec(self): vals = np.array([[2, 4, 4], [3, 3, 4]]) @@ -2784,7 +2787,6 @@ def test_str(self): assert str_repr in model_str -@pytest.mark.xfail(reason="Distribution not refactored yet") def test_discrete_trafo(): with Model(): with pytest.raises(ValueError) as err: From 8bcbe5989a8f4477057d912885d9ddd7e32b28f0 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Mon, 22 Mar 2021 22:10:30 -0500 Subject: [PATCH 21/44] Make Metropolis, Slice, PGBART, MetropolisMLDA use point values --- pymc3/aesaraf.py | 27 ++++++--------- pymc3/smc/smc.py | 6 ++-- pymc3/step_methods/arraystep.py | 57 +++++++++++++++++++++++++------- pymc3/step_methods/metropolis.py | 20 +++++------ pymc3/step_methods/mlda.py | 34 +++++++++---------- pymc3/step_methods/pgbart.py | 2 +- 6 files changed, 86 insertions(+), 60 deletions(-) diff --git a/pymc3/aesaraf.py b/pymc3/aesaraf.py index 77b1c0ec7b..eb16e3142f 100644 --- a/pymc3/aesaraf.py +++ b/pymc3/aesaraf.py @@ -11,17 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import ( - Callable, - Dict, - Generator, - Iterable, - List, - Optional, - Set, - Tuple, - Union, -) +from typing import Dict, List import aesara import aesara.tensor as at @@ -572,17 +562,20 @@ def join_nonshared_inputs( tensor_type = joined.type inarray = tensor_type("inarray") else: - inarray = aesara.shared(joined.tag.test_value, "inarray") + if point is None: + raise ValueError("A point is required when `make_shared` is True") + joined_values = np.concatenate([point[var.name].ravel() for var in vars]) + inarray = aesara.shared(joined_values, "inarray") - inarray.tag.test_value = joined.tag.test_value + if aesara.config.compute_test_value != "off": + inarray.tag.test_value = joined.tag.test_value replace = {} last_idx = 0 for var in vars: - arr_len = aet.prod(var.shape) - replace[var] = reshape_t(inarray[last_idx : last_idx + arr_len], var.shape).astype( - var.dtype - ) + shape = point[var.name].shape + arr_len = np.prod(shape, dtype=int) + replace[var] = reshape_t(inarray[last_idx : last_idx + arr_len], shape).astype(var.dtype) last_idx += arr_len replace.update(shared) diff --git a/pymc3/smc/smc.py b/pymc3/smc/smc.py index 07470dadf8..fe5a30179e 100644 --- a/pymc3/smc/smc.py +++ b/pymc3/smc/smc.py @@ -108,14 +108,14 @@ def initialize_population(self): def setup_kernel(self): """Set up the likelihood logp function based on the chosen kernel.""" - initial_values = self.model.initial_point + initial_values = self.model.test_point shared = make_shared_replacements(initial_values, self.variables, self.model) if self.kernel == "abc": factors = [var.logpt for var in self.model.free_RVs] - factors += [at.sum(factor) for factor in self.model.potentials] + factors += [aet.sum(factor) for factor in self.model.potentials] self.prior_logp_func = logp_forw( - initial_values, [at.sum(factors)], self.variables, shared + initial_values, [aet.sum(factors)], self.variables, shared ) simulator = self.model.observed_RVs[0] distance = simulator.distribution.distance diff --git a/pymc3/step_methods/arraystep.py b/pymc3/step_methods/arraystep.py index 6d765ca529..1def8c677c 100644 --- a/pymc3/step_methods/arraystep.py +++ b/pymc3/step_methods/arraystep.py @@ -149,11 +149,20 @@ def step(self, point: Dict[str, np.ndarray]): step_res = self.astep(apoint, *inputs) if self.generates_stats: - apoint, stats = self.astep(DictToArrayBijection.map(point), *inputs) - return DictToArrayBijection.rmap(apoint), stats + apoint_new, stats = step_res else: - apoint = self.astep(DictToArrayBijection.map(point), *inputs) - return DictToArrayBijection.rmap(apoint) + apoint_new = step_res + + if not isinstance(apoint_new, RaveledVars): + # We assume that the mapping has stayed the same + apoint_new = RaveledVars(apoint_new, apoint.point_map_info) + + point_new = DictToArrayBijection.rmap(apoint_new) + + if self.generates_stats: + return point_new, stats + + return point_new def astep(self, apoint, point): raise NotImplementedError() @@ -181,16 +190,40 @@ def __init__(self, vars, shared, blocked=True): def step(self, point): + # Remove shared variables from the sample point + point_no_shared = point.copy() + for name, shared_var in self.shared.items(): + shared_var.set_value(point[name]) + if name in point_no_shared: + del point_no_shared[name] + + q = DictToArrayBijection.map(point_no_shared) + + step_res = self.astep(q) + if self.generates_stats: - apoint, stats = self.astep(DictToArrayBijection.map(point)) - return DictToArrayBijection.rmap(apoint), stats + apoint, stats = step_res else: - array = 
DictToArrayBijection.map(point) - apoint = self.astep(array) - if not isinstance(apoint, RaveledVars): - # We assume that the mapping has stayed the same - apoint = RaveledVars(apoint, array.point_map_info) - return DictToArrayBijection.rmap(apoint) + apoint = step_res + + if not isinstance(apoint, RaveledVars): + # We assume that the mapping has stayed the same + apoint = RaveledVars(apoint, q.point_map_info) + + # We need to re-add the shared variables to the new sample point + a_point = DictToArrayBijection.rmap(apoint) + new_point = {} + for name in point.keys(): + shared_value = self.shared.get(name, None) + if shared_value is not None: + new_point[name] = shared_value.get_value() + else: + new_point[name] = a_point[name] + + if self.generates_stats: + return new_point, stats + + return new_point def astep(self, apoint): raise NotImplementedError() diff --git a/pymc3/step_methods/metropolis.py b/pymc3/step_methods/metropolis.py index 13e7f0d84f..422f4fa9be 100644 --- a/pymc3/step_methods/metropolis.py +++ b/pymc3/step_methods/metropolis.py @@ -24,7 +24,7 @@ import pymc3 as pm from pymc3.aesaraf import floatX -from pymc3.blocking import DictToArrayBijection +from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.step_methods.arraystep import ( ArrayStep, ArrayStepShared, @@ -150,15 +150,14 @@ def __init__( """ model = pm.modelcontext(model) - initial_values = model.initial_point + initial_values = model.test_point if vars is None: vars = model.value_vars vars = pm.inputvars(vars) if S is None: - # XXX: This needs to be refactored - S = None # np.ones(sum(v.dsize for v in vars)) + S = np.ones(sum(initial_values[v.name].size for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) @@ -177,8 +176,7 @@ def __init__( # Determine type of variables self.discrete = np.concatenate( - # XXX: This needs to be refactored - None # [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars] + [[v.dtype in pm.discrete_types] * (initial_values[v.name].size or 1) for v in vars] ) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() @@ -561,6 +559,8 @@ def astep_unif(self, q0: RaveledVars, logp) -> RaveledVars: if accepted: logp_curr = logp_prop + q = RaveledVars(q, point_map_info) + return q def astep_prop(self, q0: RaveledVars, logp) -> RaveledVars: @@ -578,6 +578,8 @@ def astep_prop(self, q0: RaveledVars, logp) -> RaveledVars: for dim, k in dimcats: logp_curr = self.metropolis_proportional(q, logp, logp_curr, dim, k) + q = RaveledVars(q, point_map_info) + return q def metropolis_proportional(self, q, logp, logp_curr, dim, k): @@ -693,8 +695,7 @@ def __init__( ): model = pm.modelcontext(model) - initial_values = model.initial_point - initial_values_size = sum(initial_values[n.name].size for n in model.value_vars) + initial_values = model.test_point if vars is None: vars = model.cont_vars @@ -842,8 +843,7 @@ def __init__( **kwargs ): model = pm.modelcontext(model) - initial_values = model.initial_point - initial_values_size = sum(initial_values[n.name].size for n in model.value_vars) + initial_values = model.test_point if vars is None: vars = model.cont_vars diff --git a/pymc3/step_methods/mlda.py b/pymc3/step_methods/mlda.py index 77a9e76b84..443ad7e7c2 100644 --- a/pymc3/step_methods/mlda.py +++ b/pymc3/step_methods/mlda.py @@ -58,7 +58,7 @@ def __init__(self, *args, **kwargs): and some extra code specific for MLDA. 
""" model = pm.modelcontext(kwargs.get("model", None)) - initial_values = model.initial_point + initial_values = model.test_point # flag to that variance reduction is activated - forces MetropolisMLDA # to store quantities of interest in a register if True @@ -71,18 +71,18 @@ def __init__(self, *args, **kwargs): self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above # extract some necessary variables - value_vars = kwargs.get("vars", None) - if value_vars is None: - value_vars = model.value_vars - value_vars = pm.inputvars(value_vars) - shared = pm.make_shared_replacements(initial_values, value_vars, model) + vars = kwargs.get("vars", None) + if vars is None: + vars = model.vars + vars = pm.inputvars(vars) + shared = pm.make_shared_replacements(initial_values, vars, model) # call parent class __init__ super().__init__(*args, **kwargs) # modify the delta function and point to model if VR is used if self.mlda_variance_reduction: - self.delta_logp = delta_logp_inverse(initial_values, model.logpt, value_vars, shared) + self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared) self.model = model def reset_tuning(self): @@ -126,7 +126,7 @@ def __init__(self, *args, **kwargs): self.tuning_end_trigger = False model = pm.modelcontext(kwargs.get("model", None)) - initial_values = model.initial_point + initial_values = model.test_point # flag to that variance reduction is activated - forces DEMetropolisZMLDA # to store quantities of interest in a register if True @@ -139,18 +139,18 @@ def __init__(self, *args, **kwargs): self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above # extract some necessary variables - value_vars = kwargs.get("vars", None) - if value_vars is None: - value_vars = model.value_vars - value_vars = pm.inputvars(value_vars) - shared = pm.make_shared_replacements(initial_values, value_vars, model) + vars = kwargs.get("vars", None) + if vars is None: + vars = model.vars + vars = pm.inputvars(vars) + shared = pm.make_shared_replacements(initial_values, vars, model) # call parent class __init__ super().__init__(*args, **kwargs) # modify the delta function and point to model if VR is used if self.mlda_variance_reduction: - self.delta_logp = delta_logp_inverse(initial_values, model.logpt, value_vars, shared) + self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared) self.model = model def reset_tuning(self): @@ -403,7 +403,7 @@ def __init__( # assign internal state model = pm.modelcontext(model) - initial_values = model.initial_point + initial_values = model.test_point self.model = model self.coarse_models = coarse_models self.model_below = self.coarse_models[-1] @@ -557,8 +557,8 @@ def __init__( # Construct aesara function for current-level model likelihood # (for use in acceptance) - shared = pm.make_shared_replacements(initial_values, value_vars, model) - self.delta_logp = delta_logp_inverse(initial_values, model.logpt, value_vars, shared) + shared = pm.make_shared_replacements(initial_values, vars, model) + self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared) # Construct aesara function for below-level model likelihood # (for use in acceptance) diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index b3b00bfa52..043f511c72 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -59,7 +59,7 @@ class PGBART(ArrayStepShared): def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None): _log.warning("The BART model is experimental. 
Use with caution.") model = modelcontext(model) - initial_values = model.initial_point + initial_values = model.test_point vars = inputvars(vars) self.bart = vars[0].distribution From 886e3f85b4cac3cebd309a79aa7e9781d4cbc6f2 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Tue, 23 Mar 2021 23:26:58 -0500 Subject: [PATCH 22/44] Set default transform for Dirichlet --- pymc3/distributions/multivariate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 803a8faf06..d368738646 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -387,8 +387,12 @@ class Dirichlet(Continuous): rv_op = dirichlet + def __new__(cls, name, *args, **kwargs): + kwargs.setdefault("transform", transforms.stick_breaking) + return super().__new__(cls, name, *args, **kwargs) + @classmethod - def dist(cls, a, transform=transforms.stick_breaking, **kwargs): + def dist(cls, a, **kwargs): a = aet.as_tensor_variable(a) # mean = a / aet.sum(a) From ea324a360d9b016d2cfeec164f4f4cd08410d6dd Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Tue, 23 Mar 2021 23:27:13 -0500 Subject: [PATCH 23/44] Normalize Multinomial argument --- pymc3/distributions/multivariate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index d368738646..3a506f67a1 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -512,7 +512,7 @@ class Multinomial(Discrete): @classmethod def dist(cls, n, p, *args, **kwargs): - # p = p / aet.sum(p, axis=-1, keepdims=True) + p = p / aet.sum(p, axis=-1, keepdims=True) n = aet.as_tensor_variable(n) p = aet.as_tensor_variable(p) From 9ace1828894263f947a7173eaa2c819cd5ed208c Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Wed, 24 Mar 2021 00:08:58 -0500 Subject: [PATCH 24/44] Fix Interval.jacobian_det --- pymc3/distributions/transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index 5be28a5cde..4e49a27f45 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -215,7 +215,7 @@ def jacobian_det(self, rv_var, rv_value): s = aet.nnet.softplus(-rv_value) return aet.log(b - a) - 2 * s - rv_value else: - return aet.ones_like(rv_value) + return rv_value interval = Interval From 134c90e8cfc2cce6437e32aaf077395896883c1f Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Wed, 24 Mar 2021 00:09:28 -0500 Subject: [PATCH 25/44] Fix Stickbreaking scalar condition --- pymc3/distributions/transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index 4e49a27f45..2fd152d458 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -283,7 +283,7 @@ class StickBreaking(Transform): name = "stickbreaking" def forward(self, rv_var, rv_value): - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + if rv_var.broadcastable[-1]: # If this variable is just a bunch of scalars/degenerate # Dirichlets, we can't transform it return rv_value @@ -296,7 +296,7 @@ def forward(self, rv_var, rv_value): return floatX(y.T) def backward(self, rv_var, rv_value): - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + if rv_var.broadcastable[-1]: # If this variable is just a bunch of scalars/degenerate # Dirichlets, we can't transform it return rv_value @@ -309,7 +309,7 @@ def backward(self, rv_var, rv_value): return floatX(x.T) def jacobian_det(self, rv_var, rv_value): - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + if rv_var.broadcastable[-1]: # If this variable is just a bunch of scalars/degenerate # Dirichlets, we can't transform it return aet.ones_like(rv_value) From fcd277c724d99fcb20631795f3d1224327afa5fc Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Wed, 24 Mar 2021 00:13:26 -0500 Subject: [PATCH 26/44] Make v4 compatibility changes to pymc3.tests.test_sampling --- pymc3/tests/test_sampling.py | 83 +++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 5cc7bba127..d857b4e544 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -384,7 +384,7 @@ def test_shared_named(self): testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) ) res = theta.eval() assert np.isclose(res, 0.0) @@ -400,7 +400,7 @@ def test_shared_unnamed(self): testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) ) res = theta.eval() assert np.isclose(res, 0.0) @@ -416,7 +416,7 @@ def test_constant_named(self): testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) ) res = theta.eval() @@ -451,7 +451,7 @@ def test_normal_scalar(self): with model: # test list input ppc0 = pm.sample_posterior_predictive([model.test_point], samples=10) - # deprecated argument is not introduced to fast version [2019/08/20:rpg] + # # deprecated argument is not introduced to fast version [2019/08/20:rpg] ppc = pm.sample_posterior_predictive(trace, var_names=["a"]) # test empty ppc ppc = pm.sample_posterior_predictive(trace, var_names=[]) @@ -461,11 +461,6 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, keep_size=True) assert ppc["a"].shape == (nchains, ndraws) - # test keep_size parameter and idata input - idata = az.from_pymc3(trace) - ppc = pm.sample_posterior_predictive(idata, keep_size=True) - assert ppc["a"].shape == (nchains, ndraws) - # test default case ppc = pm.sample_posterior_predictive(trace, 
var_names=["a"]) assert "a" in ppc @@ -479,6 +474,7 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, size=5, var_names=["a"]) assert ppc["a"].shape == (nchains * ndraws, 5) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_normal_scalar_idata(self): nchains = 2 ndraws = 500 @@ -486,19 +482,12 @@ def test_normal_scalar_idata(self): mu = pm.Normal("mu", 0.0, 1.0) a = pm.Normal("a", mu=mu, sigma=1, observed=0.0) trace = pm.sample( - draws=ndraws, - chains=nchains, - return_inferencedata=False, - discard_tuned_samples=False, + draws=ndraws, chains=nchains, return_inferencedata=True, discard_tuned_samples=False ) - assert not isinstance(trace, InferenceData) - with model: # test keep_size parameter and idata input - idata = pm.to_inference_data(trace) - assert isinstance(idata, InferenceData) - + idata = az.from_pymc3(trace) ppc = pm.sample_posterior_predictive(idata, keep_size=True) assert ppc["a"].shape == (nchains, ndraws) @@ -532,19 +521,16 @@ def test_normal_vector(self, caplog): assert "a" in ppc assert ppc["a"].shape == (10, 4, 2) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_normal_vector_idata(self, caplog): with pm.Model() as model: mu = pm.Normal("mu", 0.0, 1.0) a = pm.Normal("a", mu=mu, sigma=1, observed=np.array([0.5, 0.2])) trace = pm.sample(return_inferencedata=False) - assert not isinstance(trace, InferenceData) - with model: # test keep_size parameter with inference data as input... - idata = pm.to_inference_data(trace) - assert isinstance(idata, InferenceData) - + idata = az.from_pymc3(trace) ppc = pm.sample_posterior_predictive(idata, keep_size=True) assert ppc["a"].shape == (trace.nchains, len(trace), 2) @@ -603,6 +589,7 @@ def test_sum_normal(self): _, pval = stats.kstest(ppc["b"], stats.norm(scale=scale).cdf) assert pval > 0.001 + @pytest.mark.xfail(reason="HalfFlat not refactored for v4") def test_model_not_drawable_prior(self): data = np.random.poisson(lam=10, size=200) model = pm.Model() @@ -670,6 +657,17 @@ def test_deterministic_of_observed(self): rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-4 + model.default_rng.get_value(borrow=True).seed(0) + ppc = pm.sample_posterior_predictive( + model=model, + trace=trace, + samples=len(trace) * nchains, + random_seed=0, + var_names=[var.name for var in (model.deterministics + model.basic_RVs)], + ) + + rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-4 + def test_deterministic_of_observed_modified_interface(self): np.random.seed(4982) @@ -740,14 +738,14 @@ def test_sample_posterior_predictive_w(self): y = pm.Normal("y", mu=mu, sigma=1, observed=data0) with pytest.warns(UserWarning, match=warning_msg): trace_0 = pm.sample(10, tune=0, chains=2, return_inferencedata=False) - idata_0 = pm.to_inference_data(trace_0, log_likelihood=False) + idata_0 = az.from_pymc3(trace_0, log_likelihood=False) with pm.Model() as model_1: mu = pm.Normal("mu", mu=0, sigma=1, size=len(data0)) y = pm.Normal("y", mu=mu, sigma=1, observed=data0) with pytest.warns(UserWarning, match=warning_msg): trace_1 = pm.sample(10, tune=0, chains=2, return_inferencedata=False) - idata_1 = pm.to_inference_data(trace_1, log_likelihood=False) + idata_1 = az.from_pymc3(trace_1, log_likelihood=False) with pm.Model() as model_2: # Model with no observed RVs. 
@@ -820,15 +818,6 @@ def check_exec_nuts_init(method): "ADVI+adapt_diag", "advi+adapt_diag_grad", "advi_map", - ], -) -def test_exec_nuts_advi_init(method): - check_exec_nuts_init(method) - - -@pytest.mark.parametrize( - "method", - [ "jitter+adapt_diag", "adapt_diag", "map", @@ -836,8 +825,22 @@ def test_exec_nuts_advi_init(method): "jitter+adapt_full", ], ) +@pytest.mark.xfail(reason="ADVI not refactored for v4", exception=NotImplementedError) def test_exec_nuts_init(method): - check_exec_nuts_init(method) + with pm.Model() as model: + pm.Normal("a", mu=0, sigma=1, size=2) + pm.HalfNormal("b", sigma=1) + with model: + start, _ = pm.init_nuts(init=method, n_init=10) + assert isinstance(start, list) + assert len(start) == 1 + assert isinstance(start[0], dict) + assert "a" in start[0] and "b" in start[0] + start, _ = pm.init_nuts(init=method, n_init=10, chains=2) + assert isinstance(start, list) + assert len(start) == 2 + assert isinstance(start[0], dict) + assert "a" in start[0] and "b" in start[0] @pytest.mark.parametrize( @@ -954,16 +957,14 @@ def test_multivariate2(self): sim_priors = pm.sample_prior_predictive(samples=20, model=dm_model) sim_ppc = pm.sample_posterior_predictive(burned_trace, samples=20, model=dm_model) assert sim_priors["probs"].shape == (20, 6) - assert sim_priors["obs"].shape == (20,) + obs.distribution.shape - assert sim_ppc["obs"].shape == (20,) + obs.distribution.shape + assert sim_priors["obs"].shape == (20,) + mn_data.shape + assert sim_ppc["obs"].shape == (20,) + mn_data.shape def test_layers(self): with pm.Model() as model: a = pm.Uniform("a", lower=0, upper=1, size=10) b = pm.Binomial("b", n=1, p=a, size=10) - model.default_rng.get_value(borrow=True).seed(232093) - b_sampler = aesara.function([], b) avg = np.stack([b_sampler() for i in range(10000)]).mean(0) npt.assert_array_almost_equal(avg, 0.5 * np.ones((10,)), decimal=2) @@ -1047,7 +1048,7 @@ def test_zeroinflatedpoisson(self): def test_bounded_dist(self): with pm.Model() as model: BoundedNormal = pm.Bound(pm.Normal, lower=0.0) - x = BoundedNormal("x", mu=at.zeros((3, 1)), sd=1 * at.ones((3, 1)), size=(3, 1)) + x = BoundedNormal("x", mu=aet.zeros((3, 1)), sd=1 * aet.ones((3, 1)), size=(3, 1)) with model: prior_trace = pm.sample_prior_predictive(5) @@ -1070,6 +1071,7 @@ def test_point_list_arg_bug_spp(self, point_list_arg_bug_fixture): with pmodel: pp = pm.sample_posterior_predictive([trace[15]], var_names=["d"]) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_sample_from_xarray_prior(self, point_list_arg_bug_fixture): pmodel, trace = point_list_arg_bug_fixture @@ -1081,6 +1083,7 @@ def test_sample_from_xarray_prior(self, point_list_arg_bug_fixture): with pmodel: pp = pm.sample_posterior_predictive(idat.prior, var_names=["d"]) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_sample_from_xarray_posterior(self, point_list_arg_bug_fixture): pmodel, trace = point_list_arg_bug_fixture idat = pm.to_inference_data(trace) From e829c194f922fe3e0fdacaed53ccd0cb2c199f06 Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Wed, 24 Mar 2021 00:20:47 -0500 Subject: [PATCH 27/44] Make pymc3.tests.test_transforms work with None RV variables --- pymc3/tests/test_transforms.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pymc3/tests/test_transforms.py b/pymc3/tests/test_transforms.py index d473906d40..404dcfac92 100644 --- a/pymc3/tests/test_transforms.py +++ b/pymc3/tests/test_transforms.py @@ -65,6 +65,8 @@ def check_vector_transform(transform, domain, rv_var=None): def get_values(transform, domain=R, constructor=aet.dscalar, test=0, rv_var=None): x = constructor("x") x.tag.test_value = test + if rv_var is None: + rv_var = x f = aesara.function([x], transform.backward(rv_var, x)) return np.array([f(val) for val in domain.vals]) @@ -81,6 +83,9 @@ def check_jacobian_det( y = constructor("y") y.tag.test_value = test + if rv_var is None: + rv_var = y + x = transform.backward(rv_var, y) if make_comparable: x = make_comparable(x) @@ -127,7 +132,7 @@ def test_stickbreaking_accuracy(): x = at.dvector("x") x.tag.test_value = val identity_f = aesara.function( - [x], tr.stick_breaking.forward(None, tr.stick_breaking.backward(None, x)) + [x], tr.stick_breaking.forward(x, tr.stick_breaking.backward(x, x)) ) close_to(val, identity_f(val), tol) From c937600cfca1fc9f6b830718796d011e8e797509 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Wed, 24 Mar 2021 20:48:01 -0500 Subject: [PATCH 28/44] Fix MvNormal quaddist_matrix parameter order --- pymc3/distributions/multivariate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 3a506f67a1..501636e6de 100644 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -222,7 +222,7 @@ class MvNormal(Continuous): @classmethod def dist(cls, mu, cov=None, tau=None, chol=None, lower=True, **kwargs): mu = aet.as_tensor_variable(mu) - cov = quaddist_matrix(cov, tau, chol, lower) + cov = quaddist_matrix(cov, chol, tau, lower) return super().dist([mu, cov], **kwargs) def logp(value, mu, cov): From 60a90be13248b9b2d6e646c3ea27fba7d325cffe Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Wed, 24 Mar 2021 20:51:07 -0500 Subject: [PATCH 29/44] Enable MvNormal tests in test_distributions --- pymc3/tests/test_distributions.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index acd7b1e2bb..d8966205c0 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -227,10 +227,16 @@ def build_model(distfam, valuedomain, vardomains, extra_args=None): with Model() as m: param_vars = {} for v, dom in vardomains.items(): - vals[v] = dom.vals[0] - vals.update(extra_args) - distfam("value", size=valuedomain.shape, transform=None, **vals) - return m + v_at = aesara.shared(np.asarray(dom.vals[0])) + v_at.name = v + param_vars[v] = v_at + param_vars.update(extra_args) + distfam( + "value", + **param_vars, + transform=None, + ) + return m, param_vars def laplace_asymmetric_logpdf(value, kappa, b, mu): @@ -1767,7 +1773,6 @@ def test_mvnormal(self, n): condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to inf issues", ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_mvnormal_indef(self): cov_val = np.array([[1, 0.5], [0.5, -2]]) cov = at.matrix("cov") @@ -1782,14 +1787,13 @@ def test_mvnormal_indef(self): f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) - logp = logp(MvNormal.dist(mu=mu, tau=cov), x) + logp = logpt(MvNormal.dist(mu=mu, tau=cov), x) f_logp = aesara.function([cov, x], logp) assert f_logp(cov_val, np.ones(2)) == -np.inf dlogp = at.grad(logp, cov) f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_mvnormal_init_fail(self): with Model(): with pytest.raises(ValueError): From 931c494aac3a2e6d93097098e299824de6255218 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Wed, 24 Mar 2021 23:43:11 -0500 Subject: [PATCH 30/44] Fix NegativeBinomial parameterization and enable its tests --- pymc3/distributions/discrete.py | 39 ++++++++++++++----------------- pymc3/tests/test_distributions.py | 1 - 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index af1234307a..cb59ae36f9 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -731,35 +731,33 @@ def NegBinom(a, m, x): @classmethod def dist(cls, mu=None, alpha=None, p=None, n=None, *args, **kwargs): - mu, alpha = cls.get_mu_alpha(mu, alpha, p, n) - mu = aet.as_tensor_variable(floatX(mu)) - alpha = aet.as_tensor_variable(floatX(alpha)) - # mode = intX(aet.floor(mu)) - return super().dist([mu, alpha], *args, **kwargs) + n, p = cls.get_mu_alpha(mu, alpha, p, n) + n = aet.as_tensor_variable(floatX(n)) + p = aet.as_tensor_variable(floatX(p)) + return super().dist([n, p], *args, **kwargs) @classmethod def get_mu_alpha(cls, mu=None, alpha=None, p=None, n=None): - if alpha is None: - if n is not None: - n = aet.as_tensor_variable(intX(n)) - alpha = n + if n is None: + if alpha is not None: + n = aet.as_tensor_variable(floatX(alpha)) else: raise ValueError("Incompatible parametrization. Must specify either alpha or n.") elif alpha is not None: raise ValueError("Incompatible parametrization. 
Can't specify both alpha and n.") - if mu is None: - if p is not None: - p = aet.as_tensor_variable(floatX(p)) - mu = alpha * (1 - p) / p + if p is None: + if mu is not None: + mu = aet.as_tensor_variable(floatX(mu)) + p = n / (mu + n) else: raise ValueError("Incompatible parametrization. Must specify either mu or p.") elif mu is not None: raise ValueError("Incompatible parametrization. Can't specify both mu and p.") - return mu, alpha + return n, p - def logp(value, mu, alpha): + def logp(value, n, p): r""" Calculate log-probability of NegativeBinomial distribution at specified value. @@ -773,6 +771,8 @@ def logp(value, mu, alpha): ------- TensorVariable """ + alpha = n + mu = alpha * (1 - p) / p negbinom = bound( binomln(value + alpha - 1, value) + logpow(mu / (mu + alpha), value) @@ -783,9 +783,9 @@ def logp(value, mu, alpha): ) # Return Poisson when alpha gets very large. - return aet.switch(aet.gt(alpha, 1e10), Poisson.dist(mu).logp(value), negbinom) + return aet.switch(aet.gt(alpha, 1e10), Poisson.logp(value, mu), negbinom) - def logcdf(value, mu, alpha): + def logcdf(value, n, p): """ Compute the log of the cumulative distribution function for NegativeBinomial distribution at the specified value. @@ -805,11 +805,8 @@ def logcdf(value, mu, alpha): f"NegativeBinomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." ) - # TODO: avoid `p` recomputation if distribution was defined in terms of `p` - p = alpha / (mu + alpha) - return bound( - at.log(incomplete_beta(n, at.floor(value) + 1, p)), + aet.log(incomplete_beta(n, aet.floor(value) + 1, p)), 0 <= value, 0 < n, 0 <= p, diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index d8966205c0..5e36636afa 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -1224,7 +1224,6 @@ def modified_scipy_hypergeom_logcdf(value, N, k, n): {"N": NatSmall, "k": NatSmall, "n": NatSmall}, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_negative_binomial(self): def scipy_mu_alpha_logpmf(value, mu, alpha): return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha)) From 7bfcebfe0fcd192c1fb334b9a5015ecc2dba6601 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Thu, 25 Mar 2021 00:07:23 -0500 Subject: [PATCH 31/44] Prevent SciPy error by using float64 point in test_dirichlet_with_batch_shapes --- pymc3/tests/test_distributions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 5e36636afa..798796d53a 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -1977,8 +1977,11 @@ def test_dirichlet_with_batch_shapes(self, dist_shape): with pm.Model() as model: d = pm.Dirichlet("d", a=a) + # Generate sample points to test d_value = d.tag.value_var - d_point = d.eval() + d_point = d.eval().astype("float64") + d_point /= d_point.sum(axis=-1)[..., None] + if hasattr(d_value.tag, "transform"): d_point_trans = d_value.tag.transform.forward(d, d_point).eval() else: From 90ed8e116829af93de8fc384af0bbf78f2760a40 Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Thu, 25 Mar 2021 19:49:18 -0500 Subject: [PATCH 32/44] Create extract_obs_data function --- pymc3/aesaraf.py | 262 +----------------------------------- pymc3/tests/test_aesaraf.py | 182 ++----------------------- 2 files changed, 10 insertions(+), 434 deletions(-) diff --git a/pymc3/aesaraf.py b/pymc3/aesaraf.py index eb16e3142f..c894786fe1 100644 --- a/pymc3/aesaraf.py +++ b/pymc3/aesaraf.py @@ -20,19 +20,10 @@ from aesara import config, scalar from aesara.gradient import grad -from aesara.graph.basic import ( - Apply, - Constant, - Variable, - clone_get_equiv, - graph_inputs, - walk, -) -from aesara.graph.fg import FunctionGraph -from aesara.graph.op import Op, compute_test_value +from aesara.graph.basic import Apply, Constant, graph_inputs +from aesara.graph.op import Op from aesara.sandbox.rng_mrg import MRG_RandomStream as RandomStream from aesara.tensor.elemwise import Elemwise -from aesara.tensor.random.op import RandomVariable from aesara.tensor.sharedvar import SharedVariable from aesara.tensor.subtensor import AdvancedIncSubtensor, AdvancedIncSubtensor1 from aesara.tensor.var import TensorVariable @@ -61,124 +52,6 @@ ] -def pandas_to_array(data): - """Convert a pandas object to a NumPy array. - - XXX: When `data` is a generator, this will return a Aesara tensor! - - """ - if hasattr(data, "to_numpy") and hasattr(data, "isnull"): - # typically, but not limited to pandas objects - vals = data.to_numpy() - mask = data.isnull().to_numpy() - if mask.any(): - # there are missing values - ret = np.ma.MaskedArray(vals, mask) - else: - ret = vals - elif isinstance(data, np.ndarray): - if isinstance(data, np.ma.MaskedArray): - if not data.mask.any(): - # empty mask - ret = data.filled() - else: - # already masked and rightly so - ret = data - else: - # already a ndarray, but not masked - mask = np.isnan(data) - if np.any(mask): - ret = np.ma.MaskedArray(data, mask) - else: - # no masking required - ret = data - elif isinstance(data, Variable): - ret = data - elif sps.issparse(data): - ret = data - elif isgenerator(data): - ret = generator(data) - else: - ret = np.asarray(data) - - # type handling to enable index variables when data is int: - if hasattr(data, "dtype"): - if "int" in str(data.dtype): - return intX(ret) - # otherwise, assume float: - else: - return floatX(ret) - # needed for uses of this function other than with pm.Data: - else: - return floatX(ret) - - -def change_rv_size( - rv_var: TensorVariable, - new_size: PotentialShapeType, - expand: Optional[bool] = False, -) -> TensorVariable: - """Change or expand the size of a `RandomVariable`. - - Parameters - ========== - rv_var - The `RandomVariable` output. - new_size - The new size. - expand: - Whether or not to completely replace the `size` parameter in `rv_var` - with `new_size` or simply prepend it to the existing `size`. 
- - """ - rv_node = rv_var.owner - rng, size, dtype, *dist_params = rv_node.inputs - name = rv_var.name - tag = rv_var.tag - - if expand: - new_size = tuple(np.atleast_1d(new_size)) + tuple(size) - - new_rv_node = rv_node.op.make_node(rng, new_size, dtype, *dist_params) - rv_var = new_rv_node.outputs[-1] - rv_var.name = name - for k, v in tag.__dict__.items(): - rv_var.tag.__dict__.setdefault(k, v) - - if config.compute_test_value != "off": - compute_test_value(new_rv_node) - - return rv_var - - -def extract_rv_and_value_vars( - var: TensorVariable, -) -> Tuple[TensorVariable, TensorVariable]: - """Extract a random variable and its corresponding value variable from a generic - `TensorVariable`. - - Parameters - ========== - var - A variable corresponding to a `RandomVariable`. - - Returns - ======= - The first value in the tuple is the `RandomVariable`, and the second is the - measure-space variable that corresponds with the latter (i.e. the "value" - variable). - - """ - if not var.owner: - return None, None - - if isinstance(var.owner.op, RandomVariable): - rv_value = getattr(var.tag, "observations", getattr(var.tag, "value_var", None)) - return var, rv_value - - return None, None - - def extract_obs_data(x: TensorVariable) -> np.ndarray: """Extract data observed symbolic variables. @@ -201,137 +74,6 @@ def extract_obs_data(x: TensorVariable) -> np.ndarray: raise TypeError(f"Data cannot be extracted from {x}") -def walk_model( - graphs: Iterable[TensorVariable], - walk_past_rvs: bool = False, - stop_at_vars: Optional[Set[TensorVariable]] = None, - expand_fn: Callable[[TensorVariable], Iterable[TensorVariable]] = lambda var: [], -) -> Generator[TensorVariable, None, None]: - """Walk model graphs and yield their nodes. - - By default, these walks will not go past ``RandomVariable`` nodes. - - Parameters - ========== - graphs - The graphs to walk. - walk_past_rvs - If ``True``, the walk will not terminate at ``RandomVariable``s. - stop_at_vars - A list of variables at which the walk will terminate. - expand_fn - A function that returns the next variable(s) to be traversed. - """ - if stop_at_vars is None: - stop_at_vars = set() - - def expand(var): - new_vars = expand_fn(var) - - if ( - var.owner - and (walk_past_rvs or not isinstance(var.owner.op, RandomVariable)) - and (var not in stop_at_vars) - ): - new_vars.extend(reversed(var.owner.inputs)) - - return new_vars - - yield from walk(graphs, expand, False) - - -def replace_rvs_in_graphs( - graphs: Iterable[TensorVariable], - replacement_fn: Callable[[TensorVariable], Dict[TensorVariable, TensorVariable]], - initial_replacements: Optional[Dict[TensorVariable, TensorVariable]] = None, - **kwargs, -) -> Tuple[TensorVariable, Dict[TensorVariable, TensorVariable]]: - """Replace random variables in graphs - - This will *not* recompute test values. - - Parameters - ========== - graphs - The graphs in which random variables are to be replaced. - - Returns - ======= - Tuple containing the transformed graphs and a ``dict`` of the replacements - that were made. 
- """ - replacements = {} - if initial_replacements: - replacements.update(initial_replacements) - - def expand_replace(var): - new_nodes = [] - if var.owner and isinstance(var.owner.op, RandomVariable): - new_nodes.extend(replacement_fn(var, replacements)) - return new_nodes - - for var in walk_model(graphs, expand_fn=expand_replace, **kwargs): - pass - - if replacements: - inputs = [i for i in graph_inputs(graphs) if not isinstance(i, Constant)] - equiv = {k: k for k in replacements.keys()} - equiv = clone_get_equiv(inputs, graphs, False, False, equiv) - - fg = FunctionGraph( - [equiv[i] for i in inputs], - [equiv[o] for o in graphs], - clone=False, - ) - - fg.replace_all(replacements.items(), import_missing=True) - - graphs = list(fg.outputs) - - return graphs, replacements - - -def rvs_to_value_vars( - graphs: Iterable[TensorVariable], - apply_transforms: bool = False, - initial_replacements: Optional[Dict[TensorVariable, TensorVariable]] = None, - **kwargs, -) -> Tuple[TensorVariable, Dict[TensorVariable, TensorVariable]]: - """Replace random variables in graphs with their value variables. - - This will *not* recompute test values in the resulting graphs. - - Parameters - ========== - graphs - The graphs in which to perform the replacements. - apply_transforms - If ``True``, apply each value variable's transform. - initial_replacements - A ``dict`` containing the initial replacements to be made. - - """ - - def transform_replacements(var, replacements): - rv_var, rv_value_var = extract_rv_and_value_vars(var) - - if rv_value_var is None: - return [] - - transform = getattr(rv_value_var.tag, "transform", None) - - if transform is None or not apply_transforms: - replacements[var] = rv_value_var - return [] - - trans_rv_value = transform.backward(rv_var, rv_value_var) - replacements[var] = trans_rv_value - - return [trans_rv_value] - - return replace_rvs_in_graphs(graphs, transform_replacements, initial_replacements, **kwargs) - - def inputvars(a): """ Get the inputs into a aesara variables diff --git a/pymc3/tests/test_aesaraf.py b/pymc3/tests/test_aesaraf.py index f13c5d6500..535f3cefaa 100644 --- a/pymc3/tests/test_aesaraf.py +++ b/pymc3/tests/test_aesaraf.py @@ -23,23 +23,13 @@ import pytest import scipy.sparse as sps -from aesara.graph.basic import Variable -from aesara.tensor.random.basic import normal, uniform -from aesara.tensor.random.op import RandomVariable from aesara.tensor.subtensor import AdvancedIncSubtensor, AdvancedIncSubtensor1 from aesara.tensor.type import TensorType from aesara.tensor.var import TensorVariable import pymc3 as pm -from pymc3.aesaraf import ( - _conversion_map, - extract_obs_data, - pandas_to_array, - rvs_to_value_vars, - take_along_axis, - walk_model, -) +from pymc3.aesaraf import _conversion_map, extract_obs_data, take_along_axis from pymc3.vartypes import int_types FLOATX = str(aesara.config.floatX) @@ -276,10 +266,10 @@ def test_dtype_failure(self): def test_extract_obs_data(): with pytest.raises(TypeError): - extract_obs_data(at.matrix()) + extract_obs_data(aet.matrix()) data = np.random.normal(size=(2, 3)) - data_at = at.as_tensor(data) + data_at = aet.as_tensor(data) mask = np.random.binomial(1, 0.5, size=(2, 3)).astype(bool) for val_at in (data_at, aesara.shared(data)): @@ -291,8 +281,8 @@ def test_extract_obs_data(): # AdvancedIncSubtensor check data_m = np.ma.MaskedArray(data, mask) missing_values = data_at.type()[mask] - constant = at.as_tensor(data_m.filled()) - z_at = at.set_subtensor(constant[mask.nonzero()], missing_values) + constant = 
aet.as_tensor(data_m.filled()) + z_at = aet.set_subtensor(constant[mask.nonzero()], missing_values) assert isinstance(z_at.owner.op, AdvancedIncSubtensor) @@ -303,13 +293,13 @@ def test_extract_obs_data(): # AdvancedIncSubtensor1 check data = np.random.normal(size=(3,)) - data_at = at.as_tensor(data) + data_at = aet.as_tensor(data) mask = np.random.binomial(1, 0.5, size=(3,)).astype(bool) data_m = np.ma.MaskedArray(data, mask) missing_values = data_at.type()[mask] - constant = at.as_tensor(data_m.filled()) - z_at = at.set_subtensor(constant[mask.nonzero()], missing_values) + constant = aet.as_tensor(data_m.filled()) + z_at = aet.set_subtensor(constant[mask.nonzero()], missing_values) assert isinstance(z_at.owner.op, AdvancedIncSubtensor1) @@ -317,159 +307,3 @@ def test_extract_obs_data(): assert isinstance(res, np.ndarray) assert np.ma.allequal(res, data_m) - - -@pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"]) -def test_pandas_to_array(input_dtype): - """ - Ensure that pandas_to_array returns the dense array, masked array, - graph variable, TensorVariable, or sparse matrix as appropriate. - """ - # Create the various inputs to the function - sparse_input = sps.csr_matrix(np.eye(3)).astype(input_dtype) - dense_input = np.arange(9).reshape((3, 3)).astype(input_dtype) - - input_name = "input_variable" - aesara_graph_input = at.as_tensor(dense_input, name=input_name) - pandas_input = pd.DataFrame(dense_input) - - # All the even numbers are replaced with NaN - missing_numpy_input = np.array([[np.nan, 1, np.nan], [3, np.nan, 5], [np.nan, 7, np.nan]]) - missing_pandas_input = pd.DataFrame(missing_numpy_input) - masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0)) - - # Create a generator object. Apparently the generator object needs to - # yield numpy arrays. 
- square_generator = (np.array([i ** 2], dtype=int) for i in range(100)) - - # Alias the function to be tested - func = pandas_to_array - - ##### - # Perform the various tests - ##### - # Check function behavior with dense arrays and pandas dataframes - # without missing values - for input_value in [dense_input, pandas_input]: - func_output = func(input_value) - assert isinstance(func_output, np.ndarray) - assert func_output.shape == input_value.shape - npt.assert_allclose(func_output, dense_input) - - # Check function behavior with sparse matrix inputs - sparse_output = func(sparse_input) - assert sps.issparse(sparse_output) - assert sparse_output.shape == sparse_input.shape - npt.assert_allclose(sparse_output.toarray(), sparse_input.toarray()) - - # Check function behavior when using masked array inputs and pandas - # objects with missing data - for input_value in [missing_numpy_input, masked_array_input, missing_pandas_input]: - func_output = func(input_value) - assert isinstance(func_output, ma.core.MaskedArray) - assert func_output.shape == input_value.shape - npt.assert_allclose(func_output, masked_array_input) - - # Check function behavior with Aesara graph variable - aesara_output = func(aesara_graph_input) - assert isinstance(aesara_output, Variable) - npt.assert_allclose(aesara_output.eval(), aesara_graph_input.eval()) - intX = pm.aesaraf._conversion_map[aesara.config.floatX] - if dense_input.dtype == intX or dense_input.dtype == aesara.config.floatX: - assert aesara_output.owner is None # func should not have added new nodes - assert aesara_output.name == input_name - else: - assert aesara_output.owner is not None # func should have casted - assert aesara_output.owner.inputs[0].name == input_name - - if "float" in input_dtype: - assert aesara_output.dtype == aesara.config.floatX - else: - assert aesara_output.dtype == intX - - # Check function behavior with generator data - generator_output = func(square_generator) - - # Output is wrapped with `pm.floatX`, and this unwraps - wrapped = generator_output.owner.inputs[0] - # Make sure the returned object has .set_gen and .set_default methods - assert hasattr(wrapped, "set_gen") - assert hasattr(wrapped, "set_default") - # Make sure the returned object is a Aesara TensorVariable - assert isinstance(wrapped, TensorVariable) - - -def test_walk_model(): - d = at.vector("d") - b = at.vector("b") - c = uniform(0.0, d) - c.name = "c" - e = at.log(c) - a = normal(e, b) - a.name = "a" - - test_graph = at.exp(a + 1) - res = list(walk_model((test_graph,))) - assert a in res - assert c not in res - - res = list(walk_model((test_graph,), walk_past_rvs=True)) - assert a in res - assert c in res - - res = list(walk_model((test_graph,), walk_past_rvs=True, stop_at_vars={e})) - assert a in res - assert c not in res - - -def test_rvs_to_value_vars(): - - with pm.Model() as m: - a = pm.Uniform("a", 0.0, 1.0) - b = pm.Uniform("b", 0, a + 1.0) - c = pm.Normal("c") - d = at.log(c + b) + 2.0 - - a_value_var = m.rvs_to_values[a] - assert a_value_var.tag.transform - - b_value_var = m.rvs_to_values[b] - c_value_var = m.rvs_to_values[c] - - (res,), replaced = rvs_to_value_vars((d,)) - - assert res.owner.op == at.add - log_output = res.owner.inputs[0] - assert log_output.owner.op == at.log - log_add_output = res.owner.inputs[0].owner.inputs[0] - assert log_add_output.owner.op == at.add - c_output = log_add_output.owner.inputs[0] - - # We make sure that the random variables were replaced - # with their value variables - assert c_output == c_value_var - b_output 
= log_add_output.owner.inputs[1] - assert b_output == b_value_var - - res_ancestors = list(walk_model((res,), walk_past_rvs=True)) - res_rv_ancestors = [ - v for v in res_ancestors if v.owner and isinstance(v.owner.op, RandomVariable) - ] - - # There shouldn't be any `RandomVariable`s in the resulting graph - assert len(res_rv_ancestors) == 0 - assert b_value_var in res_ancestors - assert c_value_var in res_ancestors - assert a_value_var not in res_ancestors - - (res,), replaced = rvs_to_value_vars((d,), apply_transforms=True) - - res_ancestors = list(walk_model((res,), walk_past_rvs=True)) - res_rv_ancestors = [ - v for v in res_ancestors if v.owner and isinstance(v.owner.op, RandomVariable) - ] - - assert len(res_rv_ancestors) == 0 - assert a_value_var in res_ancestors - assert b_value_var in res_ancestors - assert c_value_var in res_ancestors From ab31fc84731bf17374ce7760640d534872dd0247 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Thu, 25 Mar 2021 20:09:16 -0500 Subject: [PATCH 33/44] Re-enable Arviz tests in pymc3.tests.test_sampling --- pymc3/tests/test_sampling.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index d857b4e544..4127b52136 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -19,7 +19,7 @@ from typing import Tuple import aesara -import aesara.tensor as at +import aesara.tensor as aet import numpy as np import numpy.testing as npt import pytest @@ -474,7 +474,6 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, size=5, var_names=["a"]) assert ppc["a"].shape == (nchains * ndraws, 5) - @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_normal_scalar_idata(self): nchains = 2 ndraws = 500 @@ -482,12 +481,19 @@ def test_normal_scalar_idata(self): mu = pm.Normal("mu", 0.0, 1.0) a = pm.Normal("a", mu=mu, sigma=1, observed=0.0) trace = pm.sample( - draws=ndraws, chains=nchains, return_inferencedata=True, discard_tuned_samples=False + draws=ndraws, + chains=nchains, + return_inferencedata=False, + discard_tuned_samples=False, ) + assert not isinstance(trace, InferenceData) + with model: # test keep_size parameter and idata input - idata = az.from_pymc3(trace) + idata = pm.to_inference_data(trace) + assert isinstance(idata, InferenceData) + ppc = pm.sample_posterior_predictive(idata, keep_size=True) assert ppc["a"].shape == (nchains, ndraws) @@ -521,16 +527,19 @@ def test_normal_vector(self, caplog): assert "a" in ppc assert ppc["a"].shape == (10, 4, 2) - @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_normal_vector_idata(self, caplog): with pm.Model() as model: mu = pm.Normal("mu", 0.0, 1.0) a = pm.Normal("a", mu=mu, sigma=1, observed=np.array([0.5, 0.2])) trace = pm.sample(return_inferencedata=False) + assert not isinstance(trace, InferenceData) + with model: # test keep_size parameter with inference data as input... 
- idata = az.from_pymc3(trace) + idata = pm.to_inference_data(trace) + assert isinstance(idata, InferenceData) + ppc = pm.sample_posterior_predictive(idata, keep_size=True) assert ppc["a"].shape == (trace.nchains, len(trace), 2) @@ -1071,7 +1080,6 @@ def test_point_list_arg_bug_spp(self, point_list_arg_bug_fixture): with pmodel: pp = pm.sample_posterior_predictive([trace[15]], var_names=["d"]) - @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_sample_from_xarray_prior(self, point_list_arg_bug_fixture): pmodel, trace = point_list_arg_bug_fixture @@ -1083,7 +1091,6 @@ def test_sample_from_xarray_prior(self, point_list_arg_bug_fixture): with pmodel: pp = pm.sample_posterior_predictive(idat.prior, var_names=["d"]) - @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_sample_from_xarray_posterior(self, point_list_arg_bug_fixture): pmodel, trace = point_list_arg_bug_fixture idat = pm.to_inference_data(trace) From a3ad9d1540a251335d8216cbabdbf6429393ecdc Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 16 Mar 2021 18:43:29 +0100 Subject: [PATCH 34/44] Fix HalfCauchy/HalfCauchyRV parameterization --- pymc3/distributions/continuous.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 55ac15d625..78069d41b5 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -2267,14 +2267,10 @@ class HalfCauchy(PositiveContinuous): @classmethod def dist(cls, beta, *args, **kwargs): beta = aet.as_tensor_variable(floatX(beta)) - - # mode = aet.as_tensor_variable(0) - # median = beta - assert_negative_support(beta, "beta", "HalfCauchy") - return super().dist([beta], **kwargs) + return super().dist([0.0, beta], **kwargs) - def logp(value, beta, alpha): + def logp(value, loc, beta): """ Calculate log-probability of HalfCauchy distribution at specified value. @@ -2289,12 +2285,12 @@ def logp(value, beta, alpha): TensorVariable """ return bound( - aet.log(2) - aet.log(np.pi) - aet.log(beta) - aet.log1p((value / beta) ** 2), - value >= 0, + aet.log(2) - aet.log(np.pi) - aet.log(beta) - aet.log1p(((value - loc) / beta) ** 2), + value >= loc, beta > 0, ) - def logcdf(value, beta, alpha): + def logcdf(value, loc, beta): """ Compute the log of the cumulative distribution function for HalfCauchy distribution at the specified value. 
@@ -2310,8 +2306,8 @@ def logcdf(value, beta, alpha): TensorVariable """ return bound( - aet.log(2 * aet.arctan(value / beta) / np.pi), - 0 <= value, + aet.log(2 * aet.arctan((value - loc) / beta) / np.pi), + loc <= value, 0 < beta, ) From 15a29b2e9c6cd63846ba8060e6a45be816bd560d Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 17 Mar 2021 16:05:01 +0100 Subject: [PATCH 35/44] Fix HalfNormal/HalfNormalRV parameterization --- pymc3/distributions/continuous.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 78069d41b5..456180041e 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -788,18 +788,12 @@ def dist(cls, sigma=None, tau=None, sd=None, *args, **kwargs): tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - # sigma = sd = sigma = aet.as_tensor_variable(sigma) - # tau = tau = aet.as_tensor_variable(tau) - - # mean = aet.sqrt(2 / (np.pi * tau)) - # variance = (1.0 - 2 / np.pi) / tau - assert_negative_support(tau, "tau", "HalfNormal") assert_negative_support(sigma, "sigma", "HalfNormal") - return super().dist([sigma, tau], **kwargs) + return super().dist([0.0, sigma], **kwargs) - def logp(value, sigma, tau): + def logp(value, loc, sigma): """ Calculate log-probability of HalfNormal distribution at specified value. @@ -813,14 +807,16 @@ def logp(value, sigma, tau): ------- TensorVariable """ + tau, sigma = get_tau_sigma(tau=None, sigma=sigma) + return bound( - -0.5 * tau * value ** 2 + 0.5 * aet.log(tau * 2.0 / np.pi), - value >= 0, + -0.5 * tau * (value - loc) ** 2 + 0.5 * aet.log(tau * 2.0 / np.pi), + value >= loc, tau > 0, sigma > 0, ) - def logcdf(value, sigma, tau): + def logcdf(value, loc, sigma): """ Compute the log of the cumulative distribution function for HalfNormal distribution at the specified value. @@ -835,10 +831,10 @@ def logcdf(value, sigma, tau): ------- TensorVariable """ - z = zvalue(value, mu=0, sigma=sigma) + z = zvalue(value, mu=loc, sigma=sigma) return bound( aet.log1p(-aet.erfc(z / aet.sqrt(2.0))), - 0 <= value, + loc <= value, 0 < sigma, ) From a5d86095147771c629c6b53b87b55e42e94f5b68 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 17 Mar 2021 14:26:08 +0100 Subject: [PATCH 36/44] Refactor Beta to use custom rng_fn clipped_beta_rv --- pymc3/distributions/continuous.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 456180041e..6083198731 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -23,7 +23,7 @@ from aesara.assert_op import Assert from aesara.tensor.random.basic import ( - beta, + BetaRV, cauchy, exponential, gamma, @@ -42,6 +42,7 @@ SplineWrapper, betaln, bound, + clipped_beta_rvs, gammaln, i0e, incomplete_beta, @@ -1151,9 +1152,6 @@ def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwar alpha = aet.as_tensor_variable(floatX(alpha)) beta = aet.as_tensor_variable(floatX(beta)) - # mean = alpha / (alpha + beta) - # variance = (alpha * beta) / ((alpha + beta) ** 2 * (alpha + beta + 1)) - assert_negative_support(alpha, "alpha", "Beta") assert_negative_support(beta, "beta", "Beta") From 3885d65fbb98f851514bc9c30313ee747918674d Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Fri, 26 Mar 2021 19:05:20 -0500 Subject: [PATCH 37/44] Re-enable v4 xfails in pymc3.distributions.dist_math --- pymc3/tests/test_dist_math.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pymc3/tests/test_dist_math.py b/pymc3/tests/test_dist_math.py index d4285f06c6..7cbcafcdfd 100644 --- a/pymc3/tests/test_dist_math.py +++ b/pymc3/tests/test_dist_math.py @@ -125,7 +125,6 @@ def logp(value, n, p): ) -@pytest.mark.xfail(reason="This test relies on the deprecated Distribution interface") def test_multinomial_bound(): x = np.array([1, 5]) @@ -144,7 +143,6 @@ def test_multinomial_bound(): ) -@pytest.mark.xfail(reason="MvNormal not implemented") class TestMvNormalLogp: def test_logp(self): np.random.seed(42) @@ -187,21 +185,21 @@ def func(chol_vec, delta): @aesara.config.change_flags(compute_test_value="ignore") def test_hessian(self): - chol_vec = at.vector("chol_vec") + chol_vec = aet.vector("chol_vec") chol_vec.tag.test_value = floatX(np.array([0.1, 2, 3])) - chol = at.stack( + chol = aet.stack( [ at.stack([at.exp(0.1 * chol_vec[0]), 0]), at.stack([chol_vec[1], 2 * at.exp(chol_vec[2])]), ] ) - cov = at.dot(chol, chol.T) - delta = at.matrix("delta") + cov = aet.dot(chol, chol.T) + delta = aet.matrix("delta") delta.tag.test_value = floatX(np.ones((5, 2))) logp = MvNormalLogp()(cov, delta) - g_cov, g_delta = at.grad(logp, [cov, delta]) + g_cov, g_delta = aet.grad(logp, [cov, delta]) # TODO: What's the test? Something needs to be asserted. - at.grad(g_delta.sum() + g_cov.sum(), [delta, cov]) + aet.grad(g_delta.sum() + g_cov.sum(), [delta, cov]) class TestSplineWrapper: From 2d16aa51ef7fb46ab1fcb143ae17cb56df0e70c1 Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sat, 27 Mar 2021 02:26:42 -0500 Subject: [PATCH 38/44] Change shape to size in pymc3.tests.test_step --- pymc3/tests/test_step.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymc3/tests/test_step.py b/pymc3/tests/test_step.py index fd02139879..350af8473e 100644 --- a/pymc3/tests/test_step.py +++ b/pymc3/tests/test_step.py @@ -979,9 +979,9 @@ def test_bad_init_parallel(self): def test_linalg(self, caplog): with Model(): - a = Normal("a", size=2, testval=floatX(np.zeros(2))) - a = at.switch(a > 0, np.inf, a) - b = at.slinalg.solve(floatX(np.eye(2)), a) + a = Normal("a", size=2) + a = aet.switch(a > 0, np.inf, a) + b = aet.slinalg.solve(floatX(np.eye(2)), a) Normal("c", mu=b, size=2, testval=floatX(np.r_[0.0, 0.0])) caplog.clear() trace = sample(20, init=None, tune=5, chains=2) From 4231ee211b130a794154c486d6bd1b2b4559c308 Mon Sep 17 00:00:00 2001 From: "Brandon T. 
Willard" Date: Sat, 27 Mar 2021 02:29:27 -0500 Subject: [PATCH 39/44] Remove incorrect size for multivariate distributions in pymc3.tests.models --- pymc3/tests/models.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pymc3/tests/models.py b/pymc3/tests/models.py index b2f98ef87b..3cd66e5098 100644 --- a/pymc3/tests/models.py +++ b/pymc3/tests/models.py @@ -106,7 +106,6 @@ def mv_simple(): "x", aet.constant(mu), tau=aet.constant(tau), - size=3, testval=floatX_array([0.1, 1.0, 0.8]), ) H = tau @@ -123,7 +122,6 @@ def mv_simple_coarse(): "x", aet.constant(mu), tau=aet.constant(tau), - size=3, testval=floatX_array([0.1, 1.0, 0.8]), ) H = tau @@ -140,7 +138,6 @@ def mv_simple_very_coarse(): "x", aet.constant(mu), tau=aet.constant(tau), - size=3, testval=floatX_array([0.1, 1.0, 0.8]), ) H = tau @@ -153,7 +150,7 @@ def mv_simple_discrete(): n = 5 p = floatX_array([0.15, 0.85]) with pm.Model() as model: - pm.Multinomial("x", n, aet.constant(p), size=d, testval=np.array([1, 4])) + pm.Multinomial("x", n, aet.constant(p), testval=np.array([1, 4])) mu = n * p # covariance matrix C = np.zeros((d, d)) @@ -187,7 +184,7 @@ def mv_prior_simple(): with pm.Model() as model: x = pm.Flat("x", size=n) - x_obs = pm.MvNormal("x_obs", observed=obs, mu=x, cov=noise * np.eye(n), size=n) + x_obs = pm.MvNormal("x_obs", observed=obs, mu=x, cov=noise * np.eye(n)) return model.initial_point, model, (K, L, mu_post, std_post, noise) From ab41e0df7a476646441760a861f8d5e436b7442a Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Fri, 26 Mar 2021 20:18:00 -0500 Subject: [PATCH 40/44] Fix extra_vars in call to ValueGradFunction from Model --- pymc3/model.py | 19 ++++++++++++------- pymc3/tests/test_model.py | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pymc3/model.py b/pymc3/model.py index d318db0dcb..bcd4f8dea8 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -377,10 +377,10 @@ def __init__( compute_grads=True, **kwargs, ): - if extra_vars is None: - extra_vars = [] + if extra_vars_and_values is None: + extra_vars_and_values = {} - names = [arg.name for arg in grad_vars + extra_vars] + names = [arg.name for arg in grad_vars + list(extra_vars_and_values.keys())] if any(name is None for name in names): raise ValueError("Arguments must be named.") if len(set(names)) != len(names): @@ -421,8 +421,8 @@ def __init__( givens = [] self._extra_vars_shared = {} - for var in extra_vars: - shared = aesara.shared(var.tag.test_value, var.name + "_shared__") + for var, value in extra_vars_and_values.items(): + shared = aesara.shared(value, var.name + "_shared__") self._extra_vars_shared[var.name] = shared givens.append((var, shared)) @@ -694,8 +694,13 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): costs = [self.logpt] input_vars = {i for i in graph_inputs(costs) if not isinstance(i, Constant)} - extra_vars = [var for var in self.free_RVs if var in input_vars] - return ValueGradFunction(costs, grad_vars, extra_vars, **kwargs) + extra_vars = [getattr(var.tag, "value_var", var) for var in self.free_RVs] + extra_vars_and_values = { + var: self.test_point[var.name] + for var in extra_vars + if var in input_vars and var not in grad_vars + } + return ValueGradFunction(costs, grad_vars, extra_vars_and_values, **kwargs) @property def logpt(self): diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index db239d5ff7..de001d2010 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -223,7 +223,7 @@ class 
TestValueGradFunction(unittest.TestCase):
     def test_no_extra(self):
         a = at.vector("a")
         a.tag.test_value = np.zeros(3, dtype=a.dtype)
-        f_grad = ValueGradFunction([a.sum()], [a], [], mode="FAST_COMPILE")
+        f_grad = ValueGradFunction([a.sum()], [a], {}, mode="FAST_COMPILE")
         assert f_grad._extra_vars == []
 
     def test_invalid_type(self):

From c6f2f31d02cc4c7fd470fb0358247d808be0ac15 Mon Sep 17 00:00:00 2001
From: drabbit17
Date: Sun, 28 Mar 2021 19:24:35 +0100
Subject: [PATCH 41/44] Update tests following distributions refactoring

The distributions refactoring moves the random variable sampling to
aesara. This relies on the numpy and scipy random variable
implementations. So now the only thing we care about testing is that
the parametrization on the PyMC side is sensible given the one on the
Aesara side (effectively the numpy/scipy one).

More details can be found on issue #4554
https://github.com/pymc-devs/pymc3/issues/4554
---
 pymc3/tests/test_distributions_random.py | 138 ++++++++++++-----------
 1 file changed, 73 insertions(+), 65 deletions(-)

diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py
index 0123a3b5c8..a95343f436 100644
--- a/pymc3/tests/test_distributions_random.py
+++ b/pymc3/tests/test_distributions_random.py
@@ -23,13 +23,14 @@
 import pytest
 import scipy.stats as st
 
+from numpy.testing import assert_almost_equal
 from scipy import linalg
 from scipy.special import expit
 
 import pymc3 as pm
 
-from pymc3.aesaraf import change_rv_size, floatX, intX
-from pymc3.distributions.dist_math import clipped_beta_rvs
+from pymc3.aesaraf import floatX, intX
+from pymc3.distributions import change_rv_size
 from pymc3.distributions.shape_utils import to_tuple
 from pymc3.exceptions import ShapeError
 from pymc3.tests.helpers import SeededTest
@@ -540,6 +541,76 @@ def test_dirichlet_random_shape(self, shape, size):
         assert pm.Dirichlet.dist(a=np.ones(shape)).random(size=size).shape == out_shape
 
 
+class TestCorrectParametrizationMappingPymcToScipy(SeededTest):
+    @staticmethod
+    def get_inputs_from_apply_node_outputs(outputs):
+        parents = outputs.get_parents()
+        if not parents:
+            raise Exception("Parent Apply node missing for output")
+        # I am assuming there will always only be 1 Apply parent node in this context
+        return parents[0].inputs
+
+    def test_pymc_params_match_rv_ones(
+        self, pymc_params, expected_aesara_params, pymc_dist, decimal=6
+    ):
+        pymc_dist_output = pymc_dist.dist(**dict(pymc_params))
+        aesera_dist_inputs = self.get_inputs_from_apply_node_outputs(pymc_dist_output)[3:]
+        assert len(expected_aesara_params) == len(aesera_dist_inputs)
+        for (expected_name, expected_value), actual_variable in zip(
+            expected_aesara_params, aesera_dist_inputs
+        ):
+            assert_almost_equal(expected_value, actual_variable.eval(), decimal=decimal)
+
+    def test_normal(self):
+        params = [("mu", 5.0), ("sigma", 10.0)]
+        self.test_pymc_params_match_rv_ones(params, params, pm.Normal)
+
+    def test_uniform(self):
+        params = [("lower", 0.5), ("upper", 1.5)]
+        self.test_pymc_params_match_rv_ones(params, params, pm.Uniform)
+
+    def test_half_normal(self):
+        params, expected_aesara_params = [("sigma", 10.0)], [("mean", 0), ("sigma", 10.0)]
+        self.test_pymc_params_match_rv_ones(params, expected_aesara_params, pm.HalfNormal)
+
+    def test_beta_alpha_beta(self):
+        params = [("alpha", 2.0), ("beta", 5.0)]
+        self.test_pymc_params_match_rv_ones(params, params, pm.Beta)
+
+    def test_beta_mu_sigma(self):
+        params = [("mu", 2.0), ("sigma", 5.0)]
+        expected_alpha, expected_beta = 
pm.Beta.get_alpha_beta(mu=params[0][1], sigma=params[1][1]) + expected_params = [("alpha", expected_alpha), ("beta", expected_beta)] + self.test_pymc_params_match_rv_ones(params, expected_params, pm.Beta) + + @pytest.mark.skip(reason="Expected to fail due to bug") + def test_exponential(self): + params = [("lam", 10.0)] + expected_params = [("lam", 1 / params[0][1])] + self.test_pymc_params_match_rv_ones(params, expected_params, pm.Exponential) + + def test_cauchy(self): + params = [("alpha", 2.0), ("beta", 5.0)] + self.test_pymc_params_match_rv_ones(params, params, pm.Cauchy) + + def test_half_cauchy(self): + params = [("alpha", 2.0), ("beta", 5.0)] + self.test_pymc_params_match_rv_ones(params, params, pm.HalfCauchy) + + @pytest.mark.skip(reason="Expected to fail due to bug") + def test_gamma_alpha_beta(self): + params = [("alpha", 2.0), ("beta", 5.0)] + expected_params = [("alpha", params[0][1]), ("beta", 1 / params[1][1])] + self.test_pymc_params_match_rv_ones(params, expected_params, pm.Gamma) + + @pytest.mark.skip(reason="Expected to fail due to bug") + def test_gamma_mu_sigma(self): + params = [("mu", 2.0), ("sigma", 5.0)] + expected_alpha, expected_beta = pm.Gamma.get_alpha_beta(mu=params[0][1], sigma=params[1][1]) + expected_params = [("alpha", expected_alpha), ("beta", 1 / expected_beta)] + self.test_pymc_params_match_rv_ones(params, expected_params, pm.Gamma) + + class TestScalarParameterSamples(SeededTest): @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_bounded(self): @@ -551,20 +622,6 @@ def ref_rand(size, tau): pymc3_random(BoundedNormal, {"tau": Rplus}, ref_rand=ref_rand) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_uniform(self): - def ref_rand(size, lower, upper): - return st.uniform.rvs(size=size, loc=lower, scale=upper - lower) - - pymc3_random(pm.Uniform, {"lower": -Rplus, "upper": Rplus}, ref_rand=ref_rand) - - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_normal(self): - def ref_rand(size, mu, sigma): - return st.norm.rvs(size=size, loc=mu, scale=sigma) - - pymc3_random(pm.Normal, {"mu": R, "sigma": Rplus}, ref_rand=ref_rand) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_truncated_normal(self): def ref_rand(size, mu, sigma, lower, upper): @@ -603,13 +660,6 @@ def ref_rand(size, alpha, mu, sigma): pymc3_random(pm.SkewNormal, {"mu": R, "sigma": Rplus, "alpha": R}, ref_rand=ref_rand) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_half_normal(self): - def ref_rand(size, tau): - return st.halfnorm.rvs(size=size, loc=0, scale=tau ** -0.5) - - pymc3_random(pm.HalfNormal, {"tau": Rplus}, ref_rand=ref_rand) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_wald(self): # Cannot do anything too exciting as scipy wald is a @@ -623,20 +673,6 @@ def ref_rand(size, mu, lam, alpha): ref_rand=ref_rand, ) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_beta(self): - def ref_rand(size, alpha, beta): - return clipped_beta_rvs(a=alpha, b=beta, size=size) - - pymc3_random(pm.Beta, {"alpha": Rplus, "beta": Rplus}, ref_rand=ref_rand) - - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_exponential(self): - def ref_rand(size, lam): - return nr.exponential(scale=1.0 / lam, size=size) - - pymc3_random(pm.Exponential, {"lam": Rplus}, ref_rand=ref_rand) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_laplace(self): def 
ref_rand(size, mu, b): @@ -670,34 +706,6 @@ def ref_rand(size, nu, mu, lam): pymc3_random(pm.StudentT, {"nu": Rplus, "mu": R, "lam": Rplus}, ref_rand=ref_rand) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_cauchy(self): - def ref_rand(size, alpha, beta): - return st.cauchy.rvs(alpha, beta, size=size) - - pymc3_random(pm.Cauchy, {"alpha": R, "beta": Rplusbig}, ref_rand=ref_rand) - - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_half_cauchy(self): - def ref_rand(size, beta): - return st.halfcauchy.rvs(scale=beta, size=size) - - pymc3_random(pm.HalfCauchy, {"beta": Rplusbig}, ref_rand=ref_rand) - - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_gamma_alpha_beta(self): - def ref_rand(size, alpha, beta): - return st.gamma.rvs(alpha, scale=1.0 / beta, size=size) - - pymc3_random(pm.Gamma, {"alpha": Rplusbig, "beta": Rplusbig}, ref_rand=ref_rand) - - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_gamma_mu_sigma(self): - def ref_rand(size, mu, sigma): - return st.gamma.rvs(mu ** 2 / sigma ** 2, scale=sigma ** 2 / mu, size=size) - - pymc3_random(pm.Gamma, {"mu": Rplusbig, "sigma": Rplusbig}, ref_rand=ref_rand) - @pytest.mark.skip(reason="This test is covered by Aesara") def test_inverse_gamma(self): def ref_rand(size, alpha, beta): From 199451dcd9857d067fc9beef5dc9927b04161ab3 Mon Sep 17 00:00:00 2001 From: drabbit17 Date: Thu, 1 Apr 2021 19:54:24 +0100 Subject: [PATCH 42/44] Change tests for more refactored distributions. More details on commit id 0773620b6f599423315035b97ef082ad32d98fd4 --- pymc3/distributions/discrete.py | 8 +- pymc3/tests/test_distributions_random.py | 94 ++++++++++++------------ 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index cb59ae36f9..4f6b3046a9 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -731,16 +731,16 @@ def NegBinom(a, m, x): @classmethod def dist(cls, mu=None, alpha=None, p=None, n=None, *args, **kwargs): - n, p = cls.get_mu_alpha(mu, alpha, p, n) + n, p = cls.get_n_p(mu, alpha, p, n) n = aet.as_tensor_variable(floatX(n)) p = aet.as_tensor_variable(floatX(p)) return super().dist([n, p], *args, **kwargs) @classmethod - def get_mu_alpha(cls, mu=None, alpha=None, p=None, n=None): + def get_n_p(cls, mu=None, alpha=None, p=None, n=None): if n is None: if alpha is not None: - n = aet.as_tensor_variable(floatX(alpha)) + n = alpha else: raise ValueError("Incompatible parametrization. Must specify either alpha or n.") elif alpha is not None: @@ -748,7 +748,7 @@ def get_mu_alpha(cls, mu=None, alpha=None, p=None, n=None): if p is None: if mu is not None: - mu = aet.as_tensor_variable(floatX(mu)) + mu = mu p = n / (mu + n) else: raise ValueError("Incompatible parametrization. 
Must specify either mu or p.") diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index a95343f436..41d9f987b1 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -550,9 +550,7 @@ def get_inputs_from_apply_node_outputs(outputs): # I am assuming there will always only be 1 Apply parent node in this context return parents[0].inputs - def test_pymc_params_match_rv_ones( - self, pymc_params, expected_aesara_params, pymc_dist, decimal=6 - ): + def _pymc_params_match_rv_ones(self, pymc_params, expected_aesara_params, pymc_dist, decimal=6): pymc_dist_output = pymc_dist.dist(**dict(pymc_params)) aesera_dist_inputs = self.get_inputs_from_apply_node_outputs(pymc_dist_output)[3:] assert len(expected_aesara_params) == len(aesera_dist_inputs) @@ -563,52 +561,88 @@ def test_pymc_params_match_rv_ones( def test_normal(self): params = [("mu", 5.0), ("sigma", 10.0)] - self.test_pymc_params_match_rv_ones(params, params, pm.Normal) + self._pymc_params_match_rv_ones(params, params, pm.Normal) def test_uniform(self): params = [("lower", 0.5), ("upper", 1.5)] - self.test_pymc_params_match_rv_ones(params, params, pm.Uniform) + self._pymc_params_match_rv_ones(params, params, pm.Uniform) def test_half_normal(self): params, expected_aesara_params = [("sigma", 10.0)], [("mean", 0), ("sigma", 10.0)] - self.test_pymc_params_match_rv_ones(params, expected_aesara_params, pm.HalfNormal) + self._pymc_params_match_rv_ones(params, expected_aesara_params, pm.HalfNormal) def test_beta_alpha_beta(self): params = [("alpha", 2.0), ("beta", 5.0)] - self.test_pymc_params_match_rv_ones(params, params, pm.Beta) + self._pymc_params_match_rv_ones(params, params, pm.Beta) def test_beta_mu_sigma(self): params = [("mu", 2.0), ("sigma", 5.0)] expected_alpha, expected_beta = pm.Beta.get_alpha_beta(mu=params[0][1], sigma=params[1][1]) expected_params = [("alpha", expected_alpha), ("beta", expected_beta)] - self.test_pymc_params_match_rv_ones(params, expected_params, pm.Beta) + self._pymc_params_match_rv_ones(params, expected_params, pm.Beta) @pytest.mark.skip(reason="Expected to fail due to bug") def test_exponential(self): params = [("lam", 10.0)] expected_params = [("lam", 1 / params[0][1])] - self.test_pymc_params_match_rv_ones(params, expected_params, pm.Exponential) + self._pymc_params_match_rv_ones(params, expected_params, pm.Exponential) def test_cauchy(self): params = [("alpha", 2.0), ("beta", 5.0)] - self.test_pymc_params_match_rv_ones(params, params, pm.Cauchy) + self._pymc_params_match_rv_ones(params, params, pm.Cauchy) def test_half_cauchy(self): params = [("alpha", 2.0), ("beta", 5.0)] - self.test_pymc_params_match_rv_ones(params, params, pm.HalfCauchy) + self._pymc_params_match_rv_ones(params, params, pm.HalfCauchy) @pytest.mark.skip(reason="Expected to fail due to bug") def test_gamma_alpha_beta(self): params = [("alpha", 2.0), ("beta", 5.0)] expected_params = [("alpha", params[0][1]), ("beta", 1 / params[1][1])] - self.test_pymc_params_match_rv_ones(params, expected_params, pm.Gamma) + self._pymc_params_match_rv_ones(params, expected_params, pm.Gamma) @pytest.mark.skip(reason="Expected to fail due to bug") def test_gamma_mu_sigma(self): params = [("mu", 2.0), ("sigma", 5.0)] expected_alpha, expected_beta = pm.Gamma.get_alpha_beta(mu=params[0][1], sigma=params[1][1]) expected_params = [("alpha", expected_alpha), ("beta", 1 / expected_beta)] - self.test_pymc_params_match_rv_ones(params, expected_params, pm.Gamma) + 
self._pymc_params_match_rv_ones(params, expected_params, pm.Gamma) + + def test_inverse_gamma_alpha_beta(self): + params = [("alpha", 2.0), ("beta", 5.0)] + self._pymc_params_match_rv_ones(params, params, pm.InverseGamma) + + def test_inverse_gamma_mu_sigma(self): + params = [("mu", 2.0), ("sigma", 5.0)] + expected_alpha, expected_beta = pm.InverseGamma._get_alpha_beta( + mu=params[0][1], sigma=params[1][1], alpha=None, beta=None + ) + expected_params = [("alpha", expected_alpha), ("beta", expected_beta)] + self._pymc_params_match_rv_ones(params, expected_params, pm.InverseGamma) + + def test_binomial(self): + params = [("n", 100), ("p", 0.33)] + self._pymc_params_match_rv_ones(params, params, pm.Binomial) + + def test_negative_binomial(self): + params = [("n", 100), ("p", 0.33)] + self._pymc_params_match_rv_ones(params, params, pm.NegativeBinomial) + + def test_negative_binomial_mu_sigma(self): + params = [("mu", 5.0), ("alpha", 8.0)] + expected_n, expected_p = pm.NegativeBinomial.get_n_p( + mu=params[0][1], alpha=params[1][1], n=None, p=None + ) + expected_params = [("n", expected_n), ("p", expected_p)] + self._pymc_params_match_rv_ones(params, expected_params, pm.NegativeBinomial) + + def test_bernoulli(self): + params = [("p", 0.33)] + self._pymc_params_match_rv_ones(params, params, pm.Bernoulli) + + def test_poisson(self): + params = [("mu", 4)] + self._pymc_params_match_rv_ones(params, params, pm.Poisson) class TestScalarParameterSamples(SeededTest): @@ -706,13 +740,6 @@ def ref_rand(size, nu, mu, lam): pymc3_random(pm.StudentT, {"nu": Rplus, "mu": R, "lam": Rplus}, ref_rand=ref_rand) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_inverse_gamma(self): - def ref_rand(size, alpha, beta): - return st.invgamma.rvs(a=alpha, scale=beta, size=size) - - pymc3_random(pm.InverseGamma, {"alpha": Rplus, "beta": Rplus}, ref_rand=ref_rand) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_pareto(self): def ref_rand(size, alpha, m): @@ -759,10 +786,6 @@ def test_half_flat(self): with pytest.raises(ValueError): f.random(1) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_binomial(self): - pymc3_random_discrete(pm.Binomial, {"n": Nat, "p": Unit}, ref_rand=st.binom.rvs) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") @pytest.mark.xfail( sys.platform.startswith("win"), @@ -776,29 +799,6 @@ def test_beta_binomial(self): def _beta_bin(self, n, alpha, beta, size=None): return st.binom.rvs(n, st.beta.rvs(a=alpha, b=beta, size=size)) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_bernoulli(self): - pymc3_random_discrete( - pm.Bernoulli, {"p": Unit}, ref_rand=lambda size, p=None: st.bernoulli.rvs(p, size=size) - ) - - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_poisson(self): - pymc3_random_discrete(pm.Poisson, {"mu": Rplusbig}, size=500, ref_rand=st.poisson.rvs) - - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_negative_binomial(self): - def ref_rand(size, alpha, mu): - return st.nbinom.rvs(alpha, alpha / (mu + alpha), size=size) - - pymc3_random_discrete( - pm.NegativeBinomial, - {"mu": Rplusbig, "alpha": Rplusbig}, - size=100, - fails=50, - ref_rand=ref_rand, - ) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_geometric(self): pymc3_random_discrete(pm.Geometric, {"p": Unit}, size=500, fails=50, ref_rand=nr.geometric) From a312231d67945abb2fdc2bc903c76b1cd09d6a0e Mon Sep 17 
00:00:00 2001 From: drabbit17 Date: Thu, 1 Apr 2021 23:44:26 +0100 Subject: [PATCH 43/44] Change tests for refactored distributions More details on commit id 0773620b6f599423315035b97ef082ad32d98fd4 comment. --- pymc3/tests/test_distributions_random.py | 112 ++++++----------------- 1 file changed, 29 insertions(+), 83 deletions(-) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 41d9f987b1..2006780a88 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -24,13 +24,13 @@ import scipy.stats as st from numpy.testing import assert_almost_equal -from scipy import linalg from scipy.special import expit import pymc3 as pm from pymc3.aesaraf import floatX, intX from pymc3.distributions import change_rv_size +from pymc3.distributions.multivariate import quaddist_matrix from pymc3.distributions.shape_utils import to_tuple from pymc3.exceptions import ShapeError from pymc3.tests.helpers import SeededTest @@ -41,7 +41,6 @@ NatSmall, PdMatrix, PdMatrixChol, - PdMatrixCholUpper, R, RandomPdMatrix, RealMatrix, @@ -644,6 +643,34 @@ def test_poisson(self): params = [("mu", 4)] self._pymc_params_match_rv_ones(params, params, pm.Poisson) + def test_mv_distribution(self): + params = [("mu", np.array([1.0, 2.0])), ("cov", np.array([[2.0, 0.0], [0.0, 3.5]]))] + self._pymc_params_match_rv_ones(params, params, pm.MvNormal) + + def test_mv_distribution_chol(self): + params = [("mu", np.array([1.0, 2.0])), ("chol", np.array([[2.0, 0.0], [0.0, 3.5]]))] + expected_cov = quaddist_matrix(chol=params[1][1]) + expected_params = [("mu", np.array([1.0, 2.0])), ("cov", expected_cov.eval())] + self._pymc_params_match_rv_ones(params, expected_params, pm.MvNormal) + + def test_mv_distribution_tau(self): + params = [("mu", np.array([1.0, 2.0])), ("tau", np.array([[2.0, 0.0], [0.0, 3.5]]))] + expected_cov = quaddist_matrix(tau=params[1][1]) + expected_params = [("mu", np.array([1.0, 2.0])), ("cov", expected_cov.eval())] + self._pymc_params_match_rv_ones(params, expected_params, pm.MvNormal) + + def test_dirichlet(self): + params = [("a", np.array([1.0, 2.0]))] + self._pymc_params_match_rv_ones(params, params, pm.Dirichlet) + + def test_multinomial(self): + params = [("n", 85), ("p", np.array([0.28, 0.62, 0.10]))] + self._pymc_params_match_rv_ones(params, params, pm.Multinomial) + + def test_categorical(self): + params = [("p", np.array([0.28, 0.62, 0.10]))] + self._pymc_params_match_rv_ones(params, params, pm.Categorical) + class TestScalarParameterSamples(SeededTest): @pytest.mark.xfail(reason="This distribution has not been refactored for v4") @@ -840,14 +867,6 @@ def ref_rand(size, q, beta): pm.DiscreteWeibull, {"q": Unit, "beta": Rplusdunif}, ref_rand=ref_rand ) - @pytest.mark.skip(reason="This test is covered by Aesara") - @pytest.mark.parametrize("s", [2, 3, 4]) - def test_categorical_random(self, s): - def ref_rand(size, p): - return nr.choice(np.arange(p.shape[0]), p=p, size=size) - - pymc3_random_discrete(pm.Categorical, {"p": Simplex(s)}, ref_rand=ref_rand) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_constant_dist(self): def ref_rand(size, c): @@ -855,51 +874,6 @@ def ref_rand(size, c): pymc3_random_discrete(pm.Constant, {"c": I}, ref_rand=ref_rand) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_mv_normal(self): - def ref_rand(size, mu, cov): - return st.multivariate_normal.rvs(mean=mu, cov=cov, size=size) - - def ref_rand_tau(size, mu, tau): - 
return ref_rand(size, mu, linalg.inv(tau)) - - def ref_rand_chol(size, mu, chol): - return ref_rand(size, mu, np.dot(chol, chol.T)) - - def ref_rand_uchol(size, mu, chol): - return ref_rand(size, mu, np.dot(chol.T, chol)) - - for n in [2, 3]: - pymc3_random( - pm.MvNormal, - {"mu": Vector(R, n), "cov": PdMatrix(n)}, - size=100, - valuedomain=Vector(R, n), - ref_rand=ref_rand, - ) - pymc3_random( - pm.MvNormal, - {"mu": Vector(R, n), "tau": PdMatrix(n)}, - size=100, - valuedomain=Vector(R, n), - ref_rand=ref_rand_tau, - ) - pymc3_random( - pm.MvNormal, - {"mu": Vector(R, n), "chol": PdMatrixChol(n)}, - size=100, - valuedomain=Vector(R, n), - ref_rand=ref_rand_chol, - ) - pymc3_random( - pm.MvNormal, - {"mu": Vector(R, n), "chol": PdMatrixCholUpper(n)}, - size=100, - valuedomain=Vector(R, n), - ref_rand=ref_rand_uchol, - extra_args={"lower": False}, - ) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_matrix_normal(self): def ref_rand(size, mu, rowcov, colcov): @@ -1042,20 +1016,6 @@ def ref_rand(size, nu, Sigma, mu): ref_rand=ref_rand, ) - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_dirichlet(self): - def ref_rand(size, a): - return st.dirichlet.rvs(a, size=size) - - for n in [2, 3]: - pymc3_random( - pm.Dirichlet, - {"a": Vector(Rplus, n)}, - valuedomain=Simplex(n), - size=100, - ref_rand=ref_rand, - ) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_dirichlet_multinomial(self): def ref_rand(size, a, n): @@ -1123,20 +1083,6 @@ def test_dirichlet_multinomial_dist_ShapeError(self, n, a, shape, expectation): with expectation: m.random() - @pytest.mark.skip(reason="This test is covered by Aesara") - def test_multinomial(self): - def ref_rand(size, p, n): - return nr.multinomial(pvals=p, n=n, size=size) - - for n in [2, 3]: - pymc3_random_discrete( - pm.Multinomial, - {"p": Simplex(n), "n": Nat}, - valuedomain=Vector(Nat, n), - size=100, - ref_rand=ref_rand, - ) - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") def test_gumbel(self): def ref_rand(size, mu, beta): From 5008194b73055d2b948350305f1e2120453f2686 Mon Sep 17 00:00:00 2001 From: drabbit17 Date: Fri, 2 Apr 2021 00:33:23 +0100 Subject: [PATCH 44/44] Remove tests for random variable samples shape and size Most of the random variable logic has been moved to aesara, as well as most of the relative tests. 
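For example (a minimal sketch, assuming the v4 `Distribution.dist` API), the
size handling that these tests used to cover is now exercised directly on the
Aesara-backed random variables:

    import pymc3 as pm

    # `dist` returns an Aesara RandomVariable graph; drawing from it is
    # delegated to Aesara/NumPy, including the `size` bookkeeping.
    x = pm.Normal.dist(mu=0.0, sigma=1.0, size=(3, 2))
    assert x.eval().shape == (3, 2)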
More details can be found on issue #4554 --- pymc3/tests/test_distributions_random.py | 111 ----------------------- 1 file changed, 111 deletions(-) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 2006780a88..6ca9be2933 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -250,12 +250,6 @@ class TestGaussianRandomWalk(BaseTestCases.BaseTestCase): default_shape = (1,) -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestNormal(BaseTestCases.BaseTestCase): - distribution = pm.Normal - params = {"mu": 0.0, "tau": 1.0} - - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") class TestTruncatedNormal(BaseTestCases.BaseTestCase): distribution = pm.TruncatedNormal @@ -280,18 +274,6 @@ class TestSkewNormal(BaseTestCases.BaseTestCase): params = {"mu": 0.0, "sigma": 1.0, "alpha": 5.0} -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestHalfNormal(BaseTestCases.BaseTestCase): - distribution = pm.HalfNormal - params = {"tau": 1.0} - - -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestUniform(BaseTestCases.BaseTestCase): - distribution = pm.Uniform - params = {"lower": 0.0, "upper": 1.0} - - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") class TestTriangular(BaseTestCases.BaseTestCase): distribution = pm.Triangular @@ -315,12 +297,6 @@ class TestKumaraswamy(BaseTestCases.BaseTestCase): params = {"a": 1.0, "b": 1.0} -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestExponential(BaseTestCases.BaseTestCase): - distribution = pm.Exponential - params = {"lam": 1.0} - - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") class TestLaplace(BaseTestCases.BaseTestCase): distribution = pm.Laplace @@ -351,30 +327,6 @@ class TestPareto(BaseTestCases.BaseTestCase): params = {"alpha": 0.5, "m": 1.0} -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestCauchy(BaseTestCases.BaseTestCase): - distribution = pm.Cauchy - params = {"alpha": 1.0, "beta": 1.0} - - -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestHalfCauchy(BaseTestCases.BaseTestCase): - distribution = pm.HalfCauchy - params = {"beta": 1.0} - - -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestGamma(BaseTestCases.BaseTestCase): - distribution = pm.Gamma - params = {"alpha": 1.0, "beta": 1.0} - - -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestInverseGamma(BaseTestCases.BaseTestCase): - distribution = pm.InverseGamma - params = {"alpha": 0.5, "beta": 0.5} - - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") class TestChiSquared(BaseTestCases.BaseTestCase): distribution = pm.ChiSquared @@ -417,42 +369,18 @@ class TestLogitNormal(BaseTestCases.BaseTestCase): params = {"mu": 0.0, "sigma": 1.0} -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestBinomial(BaseTestCases.BaseTestCase): - distribution = pm.Binomial - params = {"n": 5, "p": 0.5} - - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") class TestBetaBinomial(BaseTestCases.BaseTestCase): distribution = pm.BetaBinomial params = {"n": 5, "alpha": 1.0, "beta": 1.0} -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestBernoulli(BaseTestCases.BaseTestCase): - distribution = pm.Bernoulli - params = {"p": 0.5} - - @pytest.mark.xfail(reason="This distribution has not been 
refactored for v4") class TestDiscreteWeibull(BaseTestCases.BaseTestCase): distribution = pm.DiscreteWeibull params = {"q": 0.25, "beta": 2.0} -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestPoisson(BaseTestCases.BaseTestCase): - distribution = pm.Poisson - params = {"mu": 1.0} - - -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestNegativeBinomial(BaseTestCases.BaseTestCase): - distribution = pm.NegativeBinomial - params = {"mu": 1.0, "alpha": 1.0} - - @pytest.mark.xfail(reason="This distribution has not been refactored for v4") class TestConstant(BaseTestCases.BaseTestCase): distribution = pm.Constant @@ -501,45 +429,6 @@ class TestMoyal(BaseTestCases.BaseTestCase): params = {"mu": 0.0, "sigma": 1.0} -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestCategorical(BaseTestCases.BaseTestCase): - distribution = pm.Categorical - params = {"p": np.ones(BaseTestCases.BaseTestCase.shape)} - - def get_random_variable( - self, shape, with_vector_params=False, **kwargs - ): # don't transform categories - return super().get_random_variable(shape, with_vector_params=False, **kwargs) - - def test_probability_vector_shape(self): - """Check that if a 2d array of probabilities are passed to categorical correct shape is returned""" - p = np.ones((10, 5)) - assert pm.Categorical.dist(p=p).random().shape == (10,) - assert pm.Categorical.dist(p=p).random(size=4).shape == (4, 10) - p = np.ones((3, 7, 5)) - assert pm.Categorical.dist(p=p).random().shape == (3, 7) - assert pm.Categorical.dist(p=p).random(size=4).shape == (4, 3, 7) - - -@pytest.mark.skip(reason="This test is covered by Aesara") -class TestDirichlet(SeededTest): - @pytest.mark.parametrize( - "shape, size", - [ - ((2), (1)), - ((2), (2)), - ((2, 2), (2, 100)), - ((3, 4), (3, 4)), - ((3, 4), (3, 4, 100)), - ((3, 4), (100)), - ((3, 4), (1)), - ], - ) - def test_dirichlet_random_shape(self, shape, size): - out_shape = to_tuple(size) + to_tuple(shape) - assert pm.Dirichlet.dist(a=np.ones(shape)).random(size=size).shape == out_shape - - class TestCorrectParametrizationMappingPymcToScipy(SeededTest): @staticmethod def get_inputs_from_apply_node_outputs(outputs):