diff --git a/pymc/data.py b/pymc/data.py index 0f1a54d09c..c03461cc5b 100644 --- a/pymc/data.py +++ b/pymc/data.py @@ -20,7 +20,7 @@ import warnings from copy import copy -from typing import Any, Dict, List, Optional, Sequence, Union, cast +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast import aesara import aesara.tensor as at @@ -466,9 +466,15 @@ def align_minibatches(batches=None): rng.seed() -def determine_coords(model, value, dims: Optional[Sequence[str]] = None) -> Dict[str, Sequence]: +def determine_coords( + model, + value, + dims: Optional[Sequence[Optional[str]]] = None, + coords: Optional[Dict[str, Sequence]] = None, +) -> Tuple[Dict[str, Sequence], Sequence[Optional[str]]]: """Determines coordinate values from data or the model (via ``dims``).""" - coords = {} + if coords is None: + coords = {} # If value is a df or a series, we interpret the index as coords: if hasattr(value, "index"): @@ -499,10 +505,14 @@ def determine_coords(model, value, dims: Optional[Sequence[str]] = None) -> Dict ) for size, dim in zip(value.shape, dims): coord = model.coords.get(dim, None) - if coord is None: + if coord is None and dim is not None: coords[dim] = range(size) - return coords + if dims is None: + # TODO: Also determine dim names from the index + dims = [None] * np.ndim(value) + + return coords, dims def ConstantData( @@ -510,6 +520,7 @@ def ConstantData( value, *, dims: Optional[Sequence[str]] = None, + coords: Optional[Dict[str, Sequence]] = None, export_index_as_coords=False, **kwargs, ) -> TensorConstant: @@ -522,6 +533,7 @@ def ConstantData( name, value, dims=dims, + coords=coords, export_index_as_coords=export_index_as_coords, mutable=False, **kwargs, @@ -534,6 +546,7 @@ def MutableData( value, *, dims: Optional[Sequence[str]] = None, + coords: Optional[Dict[str, Sequence]] = None, export_index_as_coords=False, **kwargs, ) -> SharedVariable: @@ -546,6 +559,7 @@ def MutableData( name, value, dims=dims, + coords=coords, export_index_as_coords=export_index_as_coords, mutable=True, **kwargs, @@ -558,6 +572,7 @@ def Data( value, *, dims: Optional[Sequence[str]] = None, + coords: Optional[Dict[str, Sequence]] = None, export_index_as_coords=False, mutable: Optional[bool] = None, **kwargs, @@ -588,9 +603,11 @@ def Data( :ref:`arviz:quickstart`. If this parameter is not specified, the random variables will not have dimension names. + coords : dict, optional + Coordinate values to set for new dimensions introduced by this ``Data`` variable. export_index_as_coords : bool, default=False - If True, the ``Data`` container will try to infer what the coordinates should be - if there is an index in ``value``. + If True, the ``Data`` container will try to infer what the coordinates + and dimension names should be if there is an index in ``value``. mutable : bool, optional Switches between creating a :class:`~aesara.compile.sharedvalue.SharedVariable` (``mutable=True``) vs. creating a :class:`~aesara.tensor.TensorConstant` @@ -624,6 +641,9 @@ def Data( ... model.set_data('data', data_vals) ... idatas.append(pm.sample()) """ + if coords is None: + coords = {} + if isinstance(value, list): value = np.array(value) @@ -665,15 +685,28 @@ def Data( expected=x.ndim, ) - coords = determine_coords(model, value, dims) - + # Optionally infer coords and dims from the input value. if export_index_as_coords: - model.add_coords(coords) - elif dims: + coords, dims = determine_coords(model, value, dims) + + if dims: + if not mutable: + # Use the dimension lengths from the before it was tensorified. + # These can still be tensors, but in many cases they are numeric. + xshape = np.shape(arr) + else: + xshape = x.shape # Register new dimension lengths for d, dname in enumerate(dims): if not dname in model.dim_lengths: - model.add_coord(dname, values=None, length=x.shape[d]) + model.add_coord( + name=dname, + # Note: Coordinate values can't be taken from + # the value, because it could be N-dimensional. + values=coords.get(dname, None), + mutable=mutable, + length=xshape[d], + ) model.add_random_variable(x, dims=dims) diff --git a/pymc/model.py b/pymc/model.py index a3c20bb81c..59ae618b8d 100644 --- a/pymc/model.py +++ b/pymc/model.py @@ -61,7 +61,7 @@ from pymc.distributions import joint_logpt from pymc.distributions.logprob import _get_scaling from pymc.distributions.transforms import _default_transform -from pymc.exceptions import ImputationWarning, SamplingError, ShapeError, ShapeWarning +from pymc.exceptions import ImputationWarning, SamplingError, ShapeError from pymc.initial_point import make_initial_point_fn from pymc.math import flatten_list from pymc.util import ( @@ -1071,8 +1071,9 @@ def add_coord( self, name: str, values: Optional[Sequence] = None, + mutable: bool = False, *, - length: Optional[Variable] = None, + length: Optional[Union[int, Variable]] = None, ): """Registers a dimension coordinate with the model. @@ -1084,9 +1085,12 @@ def add_coord( values : optional, array-like Coordinate values or ``None`` (for auto-numbering). If ``None`` is passed, a ``length`` must be specified. + mutable : bool + Whether the created dimension should be resizable. + Default is False. length : optional, scalar - A symbolic scalar of the dimensions length. - Defaults to ``aesara.shared(len(values))``. + A scalar of the dimensions length. + Defaults to ``aesara.tensor.constant(len(values))``. """ if name in {"draw", "chain", "__sample__"}: raise ValueError( @@ -1097,7 +1101,9 @@ def add_coord( raise ValueError( f"Either `values` or `length` must be specified for the '{name}' dimension." ) - if length is not None and not isinstance(length, Variable): + if isinstance(length, int): + length = at.constant(length) + elif length is not None and not isinstance(length, Variable): raise ValueError( f"The `length` passed for the '{name}' coord must be an Aesara Variable or None." ) @@ -1109,14 +1115,17 @@ def add_coord( if not np.array_equal(values, self.coords[name]): raise ValueError(f"Duplicate and incompatible coordinate: {name}.") else: + if mutable: + self._dim_lengths[name] = length or aesara.shared(len(values)) + else: + self._dim_lengths[name] = length or aesara.tensor.constant(len(values)) self._coords[name] = values - self._dim_lengths[name] = length or aesara.shared(len(values)) def add_coords( self, coords: Dict[str, Optional[Sequence]], *, - lengths: Optional[Dict[str, Union[Variable, None]]] = None, + lengths: Optional[Dict[str, Optional[Union[int, Variable]]]] = None, ): """Vectorized version of ``Model.add_coord``.""" if coords is None: @@ -1180,24 +1189,20 @@ def set_data( # NOTE: If there are multiple pm.MutableData containers sharing this dim, but the user only # changes the values for one of them, they will run into shape problems nonetheless. if length_changed: + if original_coords is not None: + if new_coords is None: + raise ValueError( + f"The '{name}' variable already had {len(original_coords)} coord values defined for " + f"its {dname} dimension. With the new values this dimension changes to length " + f"{new_length}, so new coord values for the {dname} dimension are required." + ) if isinstance(length_tensor, TensorConstant): raise ShapeError( f"Resizing dimension '{dname}' is impossible, because " - f"a 'TensorConstant' stores its length. To be able " - f"to change the dimension length, 'fixed' in " - f"'model.add_coord' must be set to `False`." - ) - if length_tensor.owner is None: - # This is the case if the dimension was initialized - # from custom coords, but dimension length was not - # stored in TensorConstant e.g by 'fixed' set to False - - warnings.warn( - f"You're changing the shape of a variable " - f"in the '{dname}' dimension which was initialized " - f"from coords. Make sure to update the corresponding " - f"coords, otherwise you'll get shape issues.", - ShapeWarning, + "a 'TensorConstant' stores its length. To be able " + "to change the dimension length, pass `mutable=True` when " + "registering the dimension via `model.add_coord`, " + "or define it via a `pm.MutableData` variable." ) else: length_belongs_to = length_tensor.owner.inputs[0].owner.inputs[0] @@ -1210,13 +1215,6 @@ def set_data( actual=new_length, expected=old_length, ) - if original_coords is not None: - if new_coords is None: - raise ValueError( - f"The '{name}' variable already had {len(original_coords)} coord values defined for " - f"its {dname} dimension. With the new values this dimension changes to length " - f"{new_length}, so new coord values for the {dname} dimension are required." - ) if isinstance(length_tensor, ScalarSharedVariable): # Updating the shared variable resizes dependent nodes that use this dimension for their `size`. length_tensor.set_value(new_length) diff --git a/pymc/tests/test_data_container.py b/pymc/tests/test_data_container.py index 4398e9f6f5..2983c26ddd 100644 --- a/pymc/tests/test_data_container.py +++ b/pymc/tests/test_data_container.py @@ -19,7 +19,7 @@ from aesara import shared from aesara.compile.sharedvalue import SharedVariable -from aesara.tensor.sharedvar import ScalarSharedVariable +from aesara.tensor import TensorConstant from aesara.tensor.var import TensorVariable import pymc as pm @@ -315,29 +315,47 @@ def test_explicit_coords(self): } # pass coordinates explicitly, use numpy array in Data container with pm.Model(coords=coords) as pmodel: + # Dims created from coords are constant by default + assert isinstance(pmodel.dim_lengths["rows"], TensorConstant) + assert isinstance(pmodel.dim_lengths["columns"], TensorConstant) pm.MutableData("observations", data, dims=("rows", "columns")) - # new data with same shape + # new data with same (!) shape pm.set_data({"observations": data + 1}) - # new data with same shape and coords + # new data with same (!) shape and coords pm.set_data({"observations": data}, coords=coords) assert "rows" in pmodel.coords assert pmodel.coords["rows"] == ("R1", "R2", "R3", "R4", "R5") assert "rows" in pmodel.dim_lengths - assert isinstance(pmodel.dim_lengths["rows"], ScalarSharedVariable) assert pmodel.dim_lengths["rows"].eval() == 5 assert "columns" in pmodel.coords assert pmodel.coords["columns"] == ("C1", "C2", "C3", "C4", "C5", "C6", "C7") assert pmodel.RV_dims == {"observations": ("rows", "columns")} assert "columns" in pmodel.dim_lengths - assert isinstance(pmodel.dim_lengths["columns"], ScalarSharedVariable) assert pmodel.dim_lengths["columns"].eval() == 7 + def test_set_coords_through_pmdata(self): + with pm.Model() as pmodel: + pm.ConstantData( + "population", [100, 200], dims="city", coords={"city": ["Tinyvil", "Minitown"]} + ) + pm.MutableData( + "temperature", + [[15, 20, 22, 17], [18, 22, 21, 12]], + dims=("city", "season"), + coords={"season": ["winter", "spring", "summer", "fall"]}, + ) + assert "city" in pmodel.coords + assert "season" in pmodel.coords + assert pmodel.coords["city"] == ("Tinyvil", "Minitown") + assert pmodel.coords["season"] == ("winter", "spring", "summer", "fall") + def test_symbolic_coords(self): """ In v4 dimensions can be created without passing coordinate values. Their lengths are then automatically linked to the corresponding Tensor dimension. """ with pm.Model() as pmodel: + # Dims created from MutableData are TensorVariables linked to the SharedVariable.shape intensity = pm.MutableData("intensity", np.ones((2, 3)), dims=("row", "column")) assert "row" in pmodel.dim_lengths assert "column" in pmodel.dim_lengths diff --git a/pymc/tests/test_model.py b/pymc/tests/test_model.py index 50b06e24b0..6a01bc28d0 100644 --- a/pymc/tests/test_model.py +++ b/pymc/tests/test_model.py @@ -27,7 +27,9 @@ import scipy.sparse as sps import scipy.stats as st +from aesara.tensor import TensorVariable from aesara.tensor.random.op import RandomVariable +from aesara.tensor.sharedvar import ScalarSharedVariable from aesara.tensor.var import TensorConstant import pymc as pm @@ -702,6 +704,67 @@ def test_nested_model_coords(): assert set(m2.RV_dims) < set(m1.RV_dims) +def test_shapeerror_from_resize_immutable_dims(): + """ + Trying to resize an immutable dimension should raise a ShapeError. + Even if the variable being updated is a SharedVariable and has other + dimensions that are mutable. + """ + with pm.Model() as pmodel: + a = pm.Normal("a", mu=[1, 2, 3], dims="fixed") + + m = pm.MutableData("m", [[1, 2, 3]], dims=("one", "fixed")) + + # This is fine because the "fixed" dim is not resized + pm.set_data({"m": [[1, 2, 3], [3, 4, 5]]}) + + with pytest.raises(ShapeError, match="was initialized from 'a'"): + # Can't work because the "fixed" dimension is linked to a constant shape: + # Note that the new data tries to change both dimensions + with pmodel: + pm.set_data({"m": [[1, 2], [3, 4]]}) + + +def test_valueerror_from_resize_without_coords_update(): + """ + Resizing a mutable dimension that had coords, + without passing new coords raises a ValueError. + """ + with pm.Model() as pmodel: + pmodel.add_coord("shared", [1, 2, 3], mutable=True) + pm.MutableData("m", [1, 2, 3], dims=("shared")) + with pytest.raises(ValueError, match="'m' variable already had 3"): + # tries to resize m but without passing coords so raise ValueError + pm.set_data({"m": [1, 2, 3, 4]}) + + +def test_coords_and_constantdata_create_immutable_dims(): + """ + When created from `pm.Model(coords=...)` or `pm.ConstantData` + a dimension should be resizable. + """ + with pm.Model(coords={"group": ["A", "B"]}) as m: + x = pm.ConstantData("x", [0], dims="feature") + y = pm.Normal("y", x, 1, dims=("group", "feature")) + assert isinstance(m._dim_lengths["feature"], TensorConstant) + assert isinstance(m._dim_lengths["group"], TensorConstant) + assert x.eval().shape == (1,) + assert y.eval().shape == (2, 1) + + +def test_add_coord_mutable_kwarg(): + """ + Checks resulting tensor type depending on mutable kwarg in add_coord. + """ + with pm.Model() as m: + m.add_coord("fixed", values=[1], mutable=False) + m.add_coord("mutable1", values=[1, 2], mutable=True) + assert isinstance(m._dim_lengths["fixed"], TensorConstant) + assert isinstance(m._dim_lengths["mutable1"], ScalarSharedVariable) + pm.MutableData("mdata", np.ones((1, 2, 3)), dims=("fixed", "mutable1", "mutable2")) + assert isinstance(m._dim_lengths["mutable2"], TensorVariable) + + @pytest.mark.parametrize("jacobian", [True, False]) def test_model_logp(jacobian): with pm.Model() as m: