diff --git a/mcbackend/__init__.py b/mcbackend/__init__.py
index 40a0877..a0eb735 100644
--- a/mcbackend/__init__.py
+++ b/mcbackend/__init__.py
@@ -2,6 +2,7 @@
 A framework agnostic implementation for storage of MCMC draws.
 """
 
+from .backends.null import NullBackend
 from .backends.numpy import NumPyBackend
 from .core import Backend, Chain, Run
 from .meta import ChainMeta, Coordinate, DataVariable, ExtendedValue, RunMeta, Variable
@@ -16,6 +17,7 @@
 __version__ = "0.5.2"
 __all__ = [
     "NumPyBackend",
+    "NullBackend",
     "Backend",
     "Chain",
     "Run",
diff --git a/mcbackend/backends/null.py b/mcbackend/backends/null.py
new file mode 100644
index 0000000..03cb9b0
--- /dev/null
+++ b/mcbackend/backends/null.py
@@ -0,0 +1,117 @@
+"""
+This backend simply discards draws. They are not stored in memory.
+This can be used in situations where we want to run an MCMC but not permanently
+store its output.
+"""
+
+# Code-wise, a NullChain is essentially just a NumpyChain without the underlying data array.
+
+from typing import Dict, List, Mapping, Optional, Sequence, Tuple
+
+import numpy
+
+from ..core import Backend, Chain, Run
+from ..meta import ChainMeta, RunMeta
+from .numpy import grow_append, prepare_storage
+
+
+class NullChain(Chain):
+    """A null storage: discards values immediately and allocates no memory for draws.
+
+    Use cases are
+
+    - Online computations: Draws are used and discarded immediately, allowing for much larger sample spaces.
+    - Profiling: To use as a baseline, to measure compute time & memory before allocating memory for draws.
+      Comparing with another backend would then show how much overhead it adds.
+
+    Since draws are not stored, only a subset of the `Chain` interface is supported:
+
+    - Supported: `__len__`, `append`, `get_stats`, `get_stats_at`
+    - Not supported: `get_draws`, `get_draws_at`
+
+    .. Todo:: Option to also discard sampling stats?
+    .. Todo:: Allow retrieving the most recent draw?
+
+    """
+
+    def __init__(self, cmeta: ChainMeta, rmeta: RunMeta, *, preallocate: int) -> None:
+        """Creates a null storage for draws from a chain: will gobble outputs without storing them
+
+        Parameters
+        ----------
+        cmeta : ChainMeta
+            Metadata of the chain.
+        rmeta : RunMeta
+            Metadata of the MCMC run.
+        preallocate : int
+            Influences the memory pre-allocation behavior.
+            (Draws are not saved, but stats may still be.)
+            The default is to reserve memory for ``preallocate`` draws
+            and grow the allocated memory by 10 % when needed.
+            Exceptions are variables with non-rigid shapes (indicated by 0 in the shape tuple)
+            where the correct amount of memory cannot be pre-allocated.
+            In these cases object arrays are used.
+        """
+        self._draw_idx = 0
+
+        # Create storage ndarrays only for sampler stats.
+        self._stats, self._stat_is_rigid = prepare_storage(rmeta.sample_stats, preallocate)
+
+        super().__init__(cmeta, rmeta)
+
+    def append(  # pylint: disable=duplicate-code
+        self, draw: Mapping[str, numpy.ndarray], stats: Optional[Mapping[str, numpy.ndarray]] = None
+    ):
+        if stats:
+            grow_append(self._stats, stats, self._stat_is_rigid, self._draw_idx)
+        self._draw_idx += 1
+        return
+
+    def __len__(self) -> int:
+        return self._draw_idx
+
+    def get_draws(self, var_name: str, slc: slice = slice(None)) -> numpy.ndarray:
+        raise RuntimeError("NullChain does not save draws.")
+
+    def get_draws_at(self, idx: int, var_names: Sequence[str]) -> Dict[str, numpy.ndarray]:
+        raise RuntimeError("NullChain does not save draws.")
+
+    def get_stats(  # pylint: disable=duplicate-code
+        self, stat_name: str, slc: slice = slice(None)
+    ) -> numpy.ndarray:
+        data = self._stats[stat_name][: self._draw_idx][slc]
+        if self.sample_stats[stat_name].dtype == "str":
+            return numpy.array(data.tolist(), dtype=str)
+        return data
+
+    def get_stats_at(self, idx: int, stat_names: Sequence[str]) -> Dict[str, numpy.ndarray]:
+        return {sn: numpy.asarray(self._stats[sn][idx]) for sn in stat_names}
+
+
+class NullRun(Run):
+    """An MCMC run where samples are immediately discarded."""
+
+    def __init__(self, meta: RunMeta, *, preallocate: int) -> None:
+        self._settings = {"preallocate": preallocate}
+        self._chains: List[NullChain] = []
+        super().__init__(meta)
+
+    def init_chain(self, chain_number: int) -> NullChain:
+        cmeta = ChainMeta(self.meta.rid, chain_number)
+        chain = NullChain(cmeta, self.meta, **self._settings)
+        self._chains.append(chain)
+        return chain
+
+    def get_chains(self) -> Tuple[NullChain, ...]:
+        return tuple(self._chains)
+
+
+class NullBackend(Backend):
+    """A backend which discards samples immediately."""
+
+    def __init__(self, preallocate: int = 1_000) -> None:
+        self._settings = {"preallocate": preallocate}
+        super().__init__()
+
+    def init_run(self, meta: RunMeta) -> NullRun:
+        return NullRun(meta, **self._settings)
diff --git a/mcbackend/backends/numpy.py b/mcbackend/backends/numpy.py
index 6245b4c..c4e17c8 100644
--- a/mcbackend/backends/numpy.py
+++ b/mcbackend/backends/numpy.py
@@ -3,12 +3,12 @@
 """
 
 import math
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Tuple
 
 import numpy
 
 from ..core import Backend, Chain, Run, is_rigid
-from ..meta import ChainMeta, RunMeta
+from ..meta import ChainMeta, RunMeta, Variable
 
 
 def grow_append(
@@ -34,6 +34,40 @@ def grow_append(
     return
 
 
+def prepare_storage(
+    variables: Iterable[Variable], preallocate: int
+) -> Tuple[Dict[str, numpy.ndarray], Dict[str, bool]]:
+    """Allocates one storage array per variable.
+
+    Parameters
+    ----------
+    variables : Iterable[Variable]
+        Variables (or sampler stats) for which to create storage.
+    preallocate : int
+        Number of draws for which to reserve space along the first axis.
+
+    Returns
+    -------
+    storage : dict
+        Maps variable names to arrays with ``preallocate`` entries.
+        Rigid variables get an uninitialized array of their dtype and shape;
+        all others get an object array filled with ``None``.
+    rigid_dict : dict
+        Maps variable names to whether the variable is treated as rigid.
+    """
+    storage: Dict[str, numpy.ndarray] = {}
+    rigid_dict: Dict[str, bool] = {}
+    for var in variables:
+        rigid = is_rigid(var.shape) and not var.undefined_ndim and var.dtype != "str"
+        rigid_dict[var.name] = rigid
+        if rigid:
+            reserve = (preallocate, *var.shape)
+            storage[var.name] = numpy.empty(reserve, var.dtype)
+        else:
+            storage[var.name] = numpy.array([None] * preallocate, dtype=object)
+    return storage, rigid_dict
+
+
 class NumPyChain(Chain):
     """Stores value draws in NumPy arrays and can pre-allocate memory."""
 
@@ -54,25 +88,11 @@ def __init__(self, cmeta: ChainMeta, rmeta: RunMeta, *, preallocate: int) -> Non
             where the correct amount of memory cannot be pre-allocated.
             In these cases object arrays are used.
         """
-        self._var_is_rigid: Dict[str, bool] = {}
-        self._samples: Dict[str, numpy.ndarray] = {}
-        self._stat_is_rigid: Dict[str, bool] = {}
-        self._stats: Dict[str, numpy.ndarray] = {}
         self._draw_idx = 0
 
         # Create storage ndarrays for each model variable and sampler stat.
-        for target_dict, rigid_dict, variables in [
-            (self._samples, self._var_is_rigid, rmeta.variables),
-            (self._stats, self._stat_is_rigid, rmeta.sample_stats),
-        ]:
-            for var in variables:
-                rigid = is_rigid(var.shape) and not var.undefined_ndim and var.dtype != "str"
-                rigid_dict[var.name] = rigid
-                if rigid:
-                    reserve = (preallocate, *var.shape)
-                    target_dict[var.name] = numpy.empty(reserve, var.dtype)
-                else:
-                    target_dict[var.name] = numpy.array([None] * preallocate, dtype=object)
+        self._samples, self._var_is_rigid = prepare_storage(rmeta.variables, preallocate)
+        self._stats, self._stat_is_rigid = prepare_storage(rmeta.sample_stats, preallocate)
 
         super().__init__(cmeta, rmeta)
 
diff --git a/mcbackend/test_backend_null.py b/mcbackend/test_backend_null.py
new file mode 100644
index 0000000..033ab64
--- /dev/null
+++ b/mcbackend/test_backend_null.py
@@ -0,0 +1,218 @@
+import random
+
+import hagelkorn
+import numpy
+import pytest
+
+from mcbackend.backends.null import NullBackend, NullChain, NullRun
+from mcbackend.core import RunMeta, is_rigid
+from mcbackend.meta import Variable
+from mcbackend.test_utils import (
+    CheckBehavior,
+    CheckPerformance,
+    make_draw,
+    make_runmeta,
+)
+
+
+class CheckNullBehavior(CheckBehavior):
+    """
+    Overrides tests which assert that data are recorded correctly.
+    We perform all the operations of the original test, but in the
+    end we do the opposite: assert that an exception is raised
+    when either `get_draws` or `get_draws_at` is called.
+    Stats are still recorded, so that part of the tests is reproduced unchanged.
+    """
+
+    @pytest.mark.parametrize("with_stats", [False, True])
+    def test__append_get_at(self, with_stats):
+        rmeta = make_runmeta()
+        run = self.backend.init_run(rmeta)
+        chain = run.init_chain(7)
+
+        # Generate data
+        draw = make_draw(rmeta.variables)
+        stats = make_draw(rmeta.sample_stats) if with_stats else None
+
+        # Append to the chain
+        assert len(chain) == 0
+        chain.append(draw, stats)
+        assert len(chain) == 1
+
+        # Retrieve by index - Raises exception
+        with pytest.raises(RuntimeError):
+            chain.get_draws_at(0, [v.name for v in rmeta.variables])
+
+        # NB: Stats are still recorded and can be retrieved as with other chains
+        if with_stats:
+            actual = chain.get_stats_at(0, [v.name for v in rmeta.sample_stats])
+            assert isinstance(actual, dict)
+            assert set(actual) == set(stats)
+            for vn, act in actual.items():
+                numpy.testing.assert_array_equal(act, stats[vn])
+
+    @pytest.mark.parametrize("with_stats", [False, True])
+    def test__append_get_with_changelings(self, with_stats):
+        rmeta = make_runmeta(flexibility=True)
+        run = self.backend.init_run(rmeta)
+        chain = run.init_chain(7)
+
+        # Generate draws and add them to the chain
+        n = 10
+        draws = [make_draw(rmeta.variables) for _ in range(n)]
+        if with_stats:
+            stats = [make_draw(rmeta.sample_stats) for _ in range(n)]
+        else:
+            stats = [None] * n
+
+        for d, s in zip(draws, stats):
+            chain.append(d, s)
+
+        # Fetching variables raises exception
+        for var in rmeta.variables:
+            # Draws were discarded, so there is no expected value to compare against.
+            with pytest.raises(RuntimeError):
+                chain.get_draws(var.name)
+
+        if with_stats:
+            for var in rmeta.sample_stats:
+                expected = [stat[var.name] for stat in stats]
+                actual = chain.get_stats(var.name)
+                assert isinstance(actual, numpy.ndarray)
+                if var.dtype == "str":
+                    assert tuple(actual.shape) == tuple(numpy.shape(expected))
+                    # String dtypes have strange names
+                    assert "str" in actual.dtype.name
+                elif is_rigid(var.shape):
+                    assert tuple(actual.shape) == tuple(numpy.shape(expected))
+                    assert actual.dtype.name == var.dtype
+                    numpy.testing.assert_array_equal(actual, expected)
+                else:
+                    # Non-rigid variables are returned as object-arrays.
+                    assert actual.shape == (len(expected),)
+                    assert actual.dtype == object
+                    # Their values must be asserted elementwise to avoid shape problems.
+                    for act, exp in zip(actual, expected):
+                        numpy.testing.assert_array_equal(act, exp)
+
+    @pytest.mark.parametrize(
+        "slc",
+        [
+            None,
+            slice(None, None, None),
+            slice(2, None, None),
+            slice(2, 10, None),
+            slice(2, 15, 3),  # every 3rd
+            slice(15, 2, -3),  # backwards every 3rd
+            slice(2, 15, -3),  # empty
+            slice(-8, None, None),  # the last 8
+            slice(-8, -2, 2),
+            slice(-50, -2, 2),
+            slice(15, 10),  # empty
+            slice(1, 1),  # empty
+        ],
+    )
+    def test__get_slicing(self, slc: slice):
+        # "A" are just numbers to make diagnosis easier.
+        # "B" are dynamically shaped to cover the edge cases.
+        rmeta = RunMeta(
+            variables=[Variable("A", "uint8"), Variable("M", "str", [2, 3])],
+            sample_stats=[Variable("B", "uint8", [2, -1])],
+            data=[],
+        )
+        run = self.backend.init_run(rmeta)
+        chain = run.init_chain(0)
+
+        # Generate draws and add them to the chain
+        N = 20
+        draws = [make_draw(rmeta.variables) for n in range(N)]
+        stats = [make_draw(rmeta.sample_stats) for n in range(N)]
+        for d, s in zip(draws, stats):
+            chain.append(d, s)
+        assert len(chain) == N
+
+        # slc=None in this test means "don't pass it".
+        # The implementations should default to slc=slice(None, None, None).
+        kwargs = dict(slc=slc) if slc is not None else {}
+        with pytest.raises(RuntimeError):
+            chain.get_draws("A", **kwargs)
+        with pytest.raises(RuntimeError):
+            chain.get_draws("M", **kwargs)
+        act_stats = chain.get_stats("B", **kwargs)
+        expected_stats = [s["B"] for s in stats][slc or slice(None, None, None)]
+
+        # Stat "B" is dynamically shaped, which means we're dealing with
+        # dtype=object arrays. These must be checked elementwise.
+        assert len(act_stats) == len(expected_stats)
+        assert act_stats.dtype == object
+        for a, e in zip(act_stats, expected_stats):
+            numpy.testing.assert_array_equal(a, e)
+
+    def test__to_inferencedata(self):
+        """
+        NullBackend doesn’t support `to_inferencedata`, so there isn’t
+        anything to test here.
+        """
+        pass
+
+
+class TestNullBackend(CheckNullBehavior, CheckPerformance):
+    cls_backend = NullBackend
+    cls_run = NullRun
+    cls_chain = NullChain
+
+    # `test_targets` and `test_growing` are copied over from TestNumPyBackend.
+    # The lines testing sample storage were removed, since neither `_samples`
+    # nor `_var_is_rigid` is supported by NullBackend.
+    # However if one were to add tests for `_stats` and `_stat_is_rigid`
+    # to the NumPy suite, we could port those here.
+
+    def test_targets(self):
+        imb = NullBackend(preallocate=123)
+        rm = RunMeta(
+            rid=hagelkorn.random(),
+            variables=[
+                Variable("tensor", "int8", (3, 4, 5)),
+                Variable("scalar", "float64", ()),
+                Variable("changeling", "uint16", (3, -1)),
+            ],
+        )
+        run = imb.init_run(rm)
+        chain = run.init_chain(0)
+        assert len(chain) == 0
+
+    @pytest.mark.parametrize("preallocate", [0, 75])
+    def test_growing(self, preallocate):
+        imb = NullBackend(preallocate=preallocate)
+        rm = RunMeta(
+            rid=hagelkorn.random(),
+            variables=[
+                Variable(
+                    "A",
+                    "float32",
+                    (2,),
+                ),
+                Variable(
+                    "B",
+                    "float32",
+                    (-1,),
+                ),
+            ],
+        )
+        run = imb.init_run(rm)
+        chain = run.init_chain(0)
+        # TODO: Check dimensions of stats array ?
+        for _ in range(130):
+            draw = {
+                "A": numpy.random.uniform(size=(2,)),
+                "B": numpy.random.uniform(size=(random.randint(0, 10),)),
+            }
+            chain.append(draw)
+        # TODO: Check dimensions of stats array ?
+        assert len(chain) == 130
+
+
+if __name__ == "__main__":
+    tc = TestNullBackend()
+    df = tc.run_all_benchmarks()
+    print(df)