pymc-devs · michaelosthege · Feb 5, 2023 · Feb 5, 2023 · Feb 5, 2023 · Feb 5, 2023
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -86,6 +86,7 @@ jobs:
             pymc/tests/step_methods/hmc/test_quadpotential.py
 
           - |
+            pymc/tests/backends/test_mcbackend.py
             pymc/tests/distributions/test_truncated.py
             pymc/tests/logprob/test_abstract.py
             pymc/tests/logprob/test_censoring.py

diff --git a/conda-envs/environment-dev.yml b/conda-envs/environment-dev.yml
@@ -41,3 +41,4 @@ dependencies:
 - pip:
   - git+https://github.com/pymc-devs/pymc-sphinx-theme
   - numdifftools>=0.9.40
+  - mcbackend>=0.3.0
diff --git a/conda-envs/environment-test.yml b/conda-envs/environment-test.yml
@@ -31,3 +31,4 @@ dependencies:
 - types-cachetools
 - pip:
   - numdifftools>=0.9.40
+  - mcbackend>=0.3.0
diff --git a/conda-envs/windows-environment-dev.yml b/conda-envs/windows-environment-dev.yml
@@ -38,3 +38,4 @@ dependencies:
 - pip:
   - git+https://github.com/pymc-devs/pymc-sphinx-theme
   - numdifftools>=0.9.40
+  - mcbackend>=0.3.0
diff --git a/conda-envs/windows-environment-test.yml b/conda-envs/windows-environment-test.yml
@@ -31,3 +31,4 @@ dependencies:
 - types-cachetools
 - pip:
   - numdifftools>=0.9.40
+  - mcbackend>=0.3.0
diff --git a/pymc/backends/__init__.py b/pymc/backends/__init__.py
@@ -69,7 +69,19 @@
 from pymc.backends.base import BaseTrace, IBaseTrace
 from pymc.backends.ndarray import NDArray
 from pymc.model import Model
-from pymc.step_methods.compound import BlockedStep, CompoundStep
+from pymc.step_methods.compound import BlockedStep, CompoundStep, StatsBijection
+
+HAS_MCB = False
+try:
+    from mcbackend import Backend, NumPyBackend
+
+    from pymc.backends.mcbackend import ChainRecordAdapter, make_runmeta
+
+    TraceOrBackend = Union[BaseTrace, Backend]
+    HAS_MCB = True
+except ImportError:
+    TraceOrBackend = BaseTrace  # type: ignore
+
 
 __all__ = ["to_inference_data", "predictions_to_inference_data"]
 
@@ -99,7 +111,7 @@ def _init_trace(
 
 def init_traces(
     *,
-    backend: Optional[BaseTrace],
+    backend: Optional[TraceOrBackend],
     chains: int,
     expected_length: int,
     step: Union[BlockedStep, CompoundStep],
@@ -108,6 +120,27 @@ def init_traces(
     model: Model,
 ) -> Sequence[IBaseTrace]:
     """Initializes a trace recorder for each chain."""
+    if HAS_MCB and backend is None:
+        backend = NumPyBackend(preallocate=expected_length)
+    if HAS_MCB and isinstance(backend, Backend):
+        run = backend.init_run(
+            make_runmeta(
+                var_dtypes=var_dtypes,
+                var_shapes=var_shapes,
+                step=step,
+                model=model,
+            )
+        )
+        statsbj = StatsBijection(step.stats_dtypes)
+        return [
+            ChainRecordAdapter(
+                chain=run.init_chain(chain_number=chain_number),
+                stats_bijection=statsbj,
+            )
+            for chain_number in range(chains)
+        ]
+
+    assert backend is None or isinstance(backend, BaseTrace)
     return [
         _init_trace(
             expected_length=expected_length,

diff --git a/pymc/backends/base.py b/pymc/backends/base.py
@@ -58,7 +58,7 @@ class IBaseTrace(ABC, Sized):
     varnames: List[str]
     """Names of tracked variables."""
 
-    sampler_vars: List[Dict[str, type]]
+    sampler_vars: List[Dict[str, Union[type, np.dtype]]]
     """Sampler stats for each sampler."""
 
     def __len__(self):
@@ -79,23 +79,27 @@ def get_values(self, varname: str, burn=0, thin=1) -> np.ndarray:
         """
         raise NotImplementedError()
 
-    def get_sampler_stats(self, stat_name: str, sampler_idx: Optional[int] = None, burn=0, thin=1):
+    def get_sampler_stats(
+        self, stat_name: str, sampler_idx: Optional[int] = None, burn=0, thin=1
+    ) -> np.ndarray:
         """Get sampler statistics from the trace.
 
         Parameters
         ----------
-        stat_name: str
-        sampler_idx: int or None
-        burn: int
-        thin: int
+        stat_name : str
+            Name of the stat to fetch.
+        sampler_idx : int or None
+            Index of the sampler to get the stat from.
+        burn : int
+            Draws to skip from the start.
+        thin : int
+            Stepsize for the slice.
 
         Returns
         -------
-        If the `sampler_idx` is specified, return the statistic with
-        the given name in a numpy array. If it is not specified and there
-        is more than one sampler that provides this statistic, return
-        a numpy array of shape (m, n), where `m` is the number of
-        such samplers, and `n` is the number of samples.
+        stats : np.ndarray
+            If `sampler_idx` was specified, the shape should be `(draws,)`.
+            Otherwise, the shape should be `(draws, samplers)`.
         """
         raise NotImplementedError()
 
@@ -220,23 +224,31 @@ def __getitem__(self, idx):
         except (ValueError, TypeError):  # Passed variable or variable name.
             raise ValueError("Can only index with slice or integer")
 
-    def get_sampler_stats(self, stat_name, sampler_idx=None, burn=0, thin=1):
+    def get_sampler_stats(
+        self, stat_name: str, sampler_idx: Optional[int] = None, burn=0, thin=1
+    ) -> np.ndarray:
         """Get sampler statistics from the trace.
 
+        Note: This implementation attempts to squeeze object arrays into a consistent dtype,
+        which can change their shape in hard-to-predict ways.
+        See https://github.com/pymc-devs/pymc/issues/6207
+
         Parameters
         ----------
-        stat_name: str
-        sampler_idx: int or None
-        burn: int
-        thin: int
+        stat_name : str
+            Name of the stat to fetch.
+        sampler_idx : int or None
+            Index of the sampler to get the stat from.
+        burn : int
+            Draws to skip from the start.
+        thin : int
+            Stepsize for the slice.
 
         Returns
         -------
-        If the `sampler_idx` is specified, return the statistic with
-        the given name in a numpy array. If it is not specified and there
-        is more than one sampler that provides this statistic, return
-        a numpy array of shape (m, n), where `m` is the number of
-        such samplers, and `n` is the number of samples.
+        stats : np.ndarray
+            If `sampler_idx` was specified, the shape should be `(draws,)`.
+            Otherwise, the shape should be `(draws, samplers)`.
         """
         if sampler_idx is not None:
             return self._get_sampler_stats(stat_name, sampler_idx, burn, thin)
@@ -254,14 +266,16 @@ def get_sampler_stats(self, stat_name, sampler_idx=None, burn=0, thin=1):
 
         if vals.dtype == np.dtype(object):
             try:
-                vals = np.vstack(vals)
+                vals = np.vstack(list(vals))
             except ValueError:
                 # Most likely due to non-identical shapes. Just stick with the object-array.
                 pass
 
         return vals
 
-    def _get_sampler_stats(self, stat_name, sampler_idx, burn, thin):
+    def _get_sampler_stats(
+        self, stat_name: str, sampler_idx: int, burn: int, thin: int
+    ) -> np.ndarray:
         """Get sampler statistics."""
         raise NotImplementedError()
 
@@ -476,23 +490,34 @@ def get_sampler_stats(
         combine: bool = True,
         chains: Optional[Union[int, Sequence[int]]] = None,
         squeeze: bool = True,
-    ):
+    ) -> Union[List[np.ndarray], np.ndarray]:
         """Get sampler statistics from the trace.
 
+        Note: This implementation attempts to squeeze object arrays into a consistent dtype,
+        which can change their shape in hard-to-predict ways.
+        See https://github.com/pymc-devs/pymc/issues/6207
+
         Parameters
         ----------
-        stat_name: str
-        sampler_idx: int or None
-        burn: int
-        thin: int
+        stat_name : str
+            Name of the stat to fetch.
+        sampler_idx : int or None
+            Index of the sampler to get the stat from.
+        burn : int
+            Draws to skip from the start.
+        thin : int
+            Stepsize for the slice.
+        combine : bool
+            If True, results from `chains` will be concatenated.
+        squeeze : bool
+            Return a single array element if the resulting list of
+            values only has one element. If False, the result will
+            always be a list of arrays, even if `combine` is True.
 
         Returns
         -------
-        If the `sampler_idx` is specified, return the statistic with
-        the given name in a numpy array. If it is not specified and there
-        is more than one sampler that provides this statistic, return
-        a numpy array of shape (m, n), where `m` is the number of
-        such samplers, and `n` is the number of samples.
+        stats : list of np.ndarray or np.ndarray
+            A list of arrays or a single array, depending on `combine` and `squeeze`.
         """
         if stat_name not in self.stat_names:
             raise KeyError("Unknown sampler statistic %s" % stat_name)
@@ -543,7 +568,7 @@ def points(self, chains=None):
         return itl.chain.from_iterable(self._straces[chain] for chain in chains)
 
 
-def _squeeze_cat(results, combine, squeeze):
+def _squeeze_cat(results, combine: bool, squeeze: bool):
     """Squeeze and concatenate the results depending on values of
     `combine` and `squeeze`."""
     if combine: