From 96f321b8dcdddf91947ab0d8d69030acce40b00f Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Sun, 30 Apr 2023 11:24:34 +0300 Subject: [PATCH 1/5] Initial draft of version of model_builder to work with scikit-learn --- .../bayesian_estimator_linearmodel.py | 450 ++++++++++++++++++ pymc_experimental/model_builder.py | 42 +- .../test_bayesian_estimator_linearmodel.py | 170 +++++++ 3 files changed, 650 insertions(+), 12 deletions(-) create mode 100644 pymc_experimental/bayesian_estimator_linearmodel.py create mode 100644 pymc_experimental/tests/test_bayesian_estimator_linearmodel.py diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py new file mode 100644 index 000000000..6c85130fc --- /dev/null +++ b/pymc_experimental/bayesian_estimator_linearmodel.py @@ -0,0 +1,450 @@ +import json +from abc import abstractmethod +from pathlib import Path +from typing import Any, Dict, Union + +import arviz as az +import numpy as np +import pandas as pd +import pymc as pm +import xarray as xr +from pymc.util import RandomState + +from .model_builder import ModelBuilder + +# If scikit-learn is available, use its data validator +try: + from sklearn.utils.validation import check_X_y, check_array +# If scikit-learn is not available, return the data unchanged +except ImportError: + def check_X_y(X, y, **kwargs): + return X, y + + def check_array(X, **kwargs): + return X + + + +class BayesianEstimator(ModelBuilder): + """ + Base class similar to ModelBuilder but customized for integration with a scikit-learn workflow. + It is designed to encapsulate parameter inference ("fit") and posterior prediction ("predict") + for simple models with the following characteristics: + - One or more input features for each observation + - One observed output feature for each observation + + Estimators derived from this base class can utilize scikit-learn transformers for input and + output accessed via `fit` and `predict`. (`TransformedTargetRegressor` would need to be extended + in order to transform the output of `predict_proba` or `predict_posterior`.) + + Example scikit-learn usage: + >>> from sklearn.pipeline import Pipeline + >>> from sklearn.compose import TransformedTargetRegressor + >>> model = Pipeline([ + >>> ('input_scaling', StandardScaler()), + >>> ('linear_model', TransformedTargetRegressor(LinearModel(model_config), + >>> transformer=StandardScaler()),) + >>> ]) + >>> model.fit(X_obs, y_obs) + >>> y_pred = model.predict(X_pred) + + The format for probabilistic output forecasts is currently an xarray `DataSet` of the posterior + prediction draws for each input prediction. This + + The `sklearn` package is not a dependency for using this class, although if imports from `sklearn` + are successful, then scikit-learn's data validation functions are used to accept "array-like" input. + """ + model_type = "BaseClass" + version = "None" + + def __init__( + self, + model_config: Dict, + sampler_config: Dict, + ): + """ + Initializes model configuration and sampler configuration for the model + + Parameters + ---------- + model_config : dict + Parameters that initialise model configuration + sampler_config : dict + Parameters that initialise sampler configuration + """ + self.model_config = model_config # parameters for priors etc. 
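+        # model_config holds prior hyperparameters and other model-level
+        # settings keyed by parameter name, e.g.
+        # {"intercept": {"loc": 0, "scale": 10}}; subclasses define the schema.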
+ self.sampler_config = sampler_config # parameters for sampling + self.model = None + self.idata = ( + None # inference data object placeholder, idata is generated during fitting + ) + self.is_fitted_ = False + + def _validate_data(self, X, y=None): + if y is not None: + return check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=False) + else: + return check_array(X, accept_sparse=False) + + @abstractmethod + def build_model(self) -> None: + """ + Build the PYMC model. The model is built with placeholder data. + Actual data will be set by _data_setter when fitting or evaluating the model. + Data array size can change but number of dimensions must stay the same. + + Returns: + ---------- + None + """ + raise NotImplementedError + + @abstractmethod + def _data_setter(self, X, y=None): + """ + Sets new data in the model. + + Parameters + ---------- + X : array, shape (n_obs, n_features) + The training input samples. + y : array, shape (n_obs,) + The target values (real numbers). + + Returns: + ---------- + None + + """ + + raise NotImplementedError + + + def fit( + self, + X: Union[np.ndarray, pd.DataFrame, pd.Series], + y: Union[np.ndarray, pd.DataFrame, pd.Series], + progressbar: bool = True, + random_seed: RandomState = None, + **kwargs: Any, + ) -> 'BayesianEstimator': + """ + Fit a model using the data passed as a parameter. + Sets attrs to inference data of the model. + + + Parameters + ---------- + X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features) + The training input samples. + y : array-like if sklearn is available, otherwise array, shape (n_obs,) + The target values (real numbers). + progressbar : bool + Specifies whether the fit progressbar should be displayed + random_seed : RandomState + Provides sampler with initial random seed for obtaining reproducible samples + **kwargs : Any + Custom sampler settings can be provided in form of keyword arguments. + + Returns + ------- + self : object + Returns self. + """ + + X, y = self._validate_data(X, y) + + self.build_model() + self._data_setter(X, y) + + sampler_config = self.sampler_config.copy() + sampler_config["progressbar"] = progressbar + sampler_config["random_seed"] = random_seed + sampler_config.update(**kwargs) + + self.idata = self.sample_model(**sampler_config) + return self + + def predict( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + ) -> np.ndarray: + """ + Uses model to predict on unseen data and return point prediction of all the samples. The point prediction + for each input row is the expected output value, computed as the mean of MCMC samples. + + Parameters + --------- + X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features) + The input data used for prediction. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to True. + + Returns + ------- + y_pred : ndarray, shape (n_pred,) + Predicted output corresponding to input X_pred. 
+ """ + if not hasattr(self, 'output_var'): + raise NotImplementedError(f'Subclasses of {__class__} should set self.output_var') + + X_pred = self._validate_data(X_pred) + + posterior_predictive_samples = self.sample_posterior_predictive(X_pred, extend_idata, combined=False) + + if self.output_var not in posterior_predictive_samples: + raise KeyError(f'Output variable {self.output_var} not found in posterior predictive samples.') + + posterior_means = posterior_predictive_samples[self.output_var].mean(dim=["chain", "draw"], keep_attrs=False) + return posterior_means.data + + def predict_posterior( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + combined: bool = True, + ) -> xr.DataArray: + """ + Generate posterior predictive samples on unseen data. + + Parameters + --------- + X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features) + The input data used for prediction. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to True. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. + + Returns + ------- + y_pred : DataArray, shape (n_pred, chains * draws) if combined is True, otherwise (chains, draws, n_pred) + Posterior predictive samples for each input X_pred + """ + if not hasattr(self, 'output_var'): + raise NotImplementedError(f'Subclasses of {__class__} should set self.output_var') + + X_pred = self._validate_data(X_pred) + posterior_predictive_samples = self.sample_posterior_predictive(X_pred, extend_idata, combined) + + if self.output_var not in posterior_predictive_samples: + raise KeyError(f'Output variable {self.output_var} not found in posterior predictive samples.') + + return posterior_predictive_samples[self.output_var] + + def predict_proba( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + combined: bool = False, + ) -> xr.DataArray: + """ Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators. """ + return self.predict_posterior(self, X_pred, extend_idata, combined) + + def sample_prior_predictive(self, + X_pred, + samples: int = None, + extend_idata: bool = False, + combined: bool = True + ): + """ + Sample from the model's prior predictive distribution. + + Parameters + --------- + X_pred : array, shape (n_pred, n_features) + The input data used for prediction using prior distribution. + samples : int + Number of samples from the prior parameter distributions to generate. + If not set, uses sampler_config['draws'] if that is available, otherwise defaults to 500. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to False. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. 
+ + Returns + ------- + prior_predictive_samples : DataArray, shape (n_pred, samples) + Prior predictive samples for each input X_pred + """ + if samples is None: + samples = self.sampler_config.get('draws', 500) + + if self.model is None: + self.build_model() + + self._data_setter(X_pred) + + with self.model: # sample with new input data + prior_pred = pm.sample_prior_predictive(samples) + self.set_idata_attrs(prior_pred) + if extend_idata: + if self.idata is not None: + self.idata.extend(prior_pred) + else: + self.idata = prior_pred + + prior_predictive_samples = az.extract( + prior_pred, "prior_predictive", combined=combined + ) + print('prior_pred = ') + print(prior_pred) + print('prior_predictive_samples = ') + print(prior_predictive_samples) + + return prior_predictive_samples + + def sample_posterior_predictive(self, X_pred, extend_idata, combined): + """ + Sample from the model's posterior predictive distribution. + + Parameters + --------- + X_pred : array, shape (n_pred, n_features) + The input data used for prediction using prior distribution.. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to False. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. + + Returns + ------- + posterior_predictive_samples : DataArray, shape (n_pred, samples) + Posterior predictive samples for each input X_pred + """ + self._data_setter(X_pred) + + with self.model: # sample with new input data + post_pred = pm.sample_posterior_predictive(self.idata) + if extend_idata: + self.idata.extend(post_pred) + + posterior_predictive_samples = az.extract( + post_pred, "posterior_predictive", combined=combined + ) + + return posterior_predictive_samples + + @classmethod + def load(cls, fname: str): + """ + Creates a BayesianEstimator instance from a file, + Loads inference data for the model. + + Parameters + ---------- + fname : string + This denotes the name with path from where idata should be loaded from. + + Returns + ------- + Returns an instance of BayesianEstimator. + + Raises + ------ + ValueError + If the inference data that is loaded doesn't match with the model. + """ + + filepath = Path(str(fname)) + idata = az.from_netcdf(filepath) + model = cls( + model_config=json.loads(idata.attrs["model_config"]), + sampler_config=json.loads(idata.attrs["sampler_config"]), + ) + model.idata = idata + model.build_model() + # All previously used data is in idata. + + if model.id != idata.attrs["id"]: + raise ValueError( + f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'" + ) + + return model + + @property + def _serializable_model_config(self): + return self.model_config + + def get_params(self, deep=True): + """ + Get all the model parameters needed to instantiate a copy of the model, not including training data. + """ + return {'model_config': self.model_config, + 'sampler_config': self.sampler_config} + + def set_params(self, **params): + """ + Set all the model parameters needed to instantiate the model, not including training data. + """ + self.model_config = params['model_config'] + self.sampler_config = params['sampler_config'] + + +class LinearModel(BayesianEstimator): + """ + This class is an implementation of a single-input linear regression model in PYMC using the + BayesianEstimator base class for interoperability with scikit-learn. 
+ """ + _model_type = "LinearModel" + version = "0.1" + + def __init__( + self, + model_config={ + "intercept": {"loc": 0, "scale": 10}, + "slope": {"loc": 0, "scale": 10}, + "obs_error": 2, + "default_output_var": "y_hat", + }, + sampler_config={ + "draws": 1_000, + "tune": 1_000, + "chains": 3, + "target_accept": 0.95, + }, + ): + super().__init__(model_config, sampler_config) + self.output_var = 'y_hat' + + def build_model(self): + cfg = self.model_config + + # The model is built with placeholder data. + # Actual data will be set by _data_setter when fitting or evaluating the model. + # Data array size can change but number of dimensions must stay the same. + with pm.Model() as self.model: + x = pm.MutableData("x", np.zeros((1,))) + y_data = pm.MutableData("y_data", np.zeros((1,))) + + # priors + intercept = pm.Normal("intercept", + cfg["intercept"]["loc"], + sigma=cfg["intercept"]["scale"]) + slope = pm.Normal("slope", + cfg["slope"]["loc"], + sigma=cfg["slope"]["scale"]) + obs_error = pm.HalfNormal("σ_model_fmc", cfg["obs_error"]) + + # Model + y_model = pm.Deterministic('y_model', intercept + slope * x) + + # observed data + y_hat = pm.Normal("y_hat", y_model + obs_error, shape=x.shape, observed=y_data) + + def _data_setter(self, X, y=None): + with self.model: + pm.set_data({"x": X.squeeze()}) + if y is not None: + pm.set_data({"y_data": y.squeeze()}) + + @classmethod + def create_sample_input(cls, nsamples=100): + x = np.linspace(start=0, stop=1, num=nsamples) + y = 5 * x + 3 + y = y + np.random.normal(0, 1, len(x)) + + x = np.expand_dims(x, -1) # scikit assumes a dimension for features. + return x, y diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index 096109ee8..48f168d8d 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -176,6 +176,33 @@ def build_model( """ raise NotImplementedError + def sample_model(self, **kwargs): + + if self.model is None: + raise RuntimeError( + "The model hasn't been built yet, call .build_model() first or call .fit() instead." + ) + + with self.model: + sampler_args = {**self.sampler_config, **kwargs} + idata = pm.sample(**sampler_args) + idata.extend(pm.sample_prior_predictive()) + idata.extend(pm.sample_posterior_predictive(idata)) + + self.set_idata_attrs(idata) + return idata + + def set_idata_attrs(self, idata=None): + if idata is None: + idata = self.idata + if idata is None: + raise RuntimeError("No idata provided to set attrs on.") + idata.attrs["id"] = self.id + idata.attrs["model_type"] = self._model_type + idata.attrs["version"] = self.version + idata.attrs["sampler_config"] = json.dumps(self.sampler_config) + idata.attrs["model_config"] = json.dumps(self._serializable_model_config) + def save(self, fname: str) -> None: """ Saves inference data of the model. 
@@ -195,7 +222,7 @@ def save(self, fname: str) -> None: >>> name = './mymodel.nc' >>> model.save(name) """ - if self.idata is not None and "fit_data" in self.idata: + if self.idata is not None and "posterior" in self.idata: file = Path(str(fname)) self.idata.to_netcdf(file) else: @@ -232,9 +259,9 @@ def load(cls, fname: str): filepath = Path(str(fname)) idata = az.from_netcdf(filepath) model_builder = cls( + data=idata.fit_data.to_dataframe(), model_config=json.loads(idata.attrs["model_config"]), sampler_config=json.loads(idata.attrs["sampler_config"]), - data=idata.fit_data.to_dataframe(), ) model_builder.idata = idata model_builder.build_model(model_builder.data, model_builder.model_config) @@ -294,16 +321,7 @@ def fit( sampler_config["progressbar"] = progressbar sampler_config["random_seed"] = random_seed - with self.model: - self.idata = pm.sample(**self.sampler_config, **kwargs) - self.idata.extend(pm.sample_prior_predictive()) - self.idata.extend(pm.sample_posterior_predictive(self.idata)) - - self.idata.attrs["id"] = self.id - self.idata.attrs["model_type"] = self._model_type - self.idata.attrs["version"] = self.version - self.idata.attrs["sampler_config"] = json.dumps(self.sampler_config) - self.idata.attrs["model_config"] = json.dumps(self._serializable_model_config) + self.idata = self.sample_model(**sampler_config) self.idata.add_groups(fit_data=self.data.to_xarray()) return self.idata diff --git a/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py b/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py new file mode 100644 index 000000000..7bd4c167f --- /dev/null +++ b/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py @@ -0,0 +1,170 @@ + +# Copyright 2023 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
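+
+# These tests exercise LinearModel both directly and through a scikit-learn
+# Pipeline; the sklearn-dependent cases are skipped when sklearn is absent.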
+ +import hashlib +import sys +import tempfile + +import numpy as np +import xarray as xr +import pytest + +from pymc_experimental.bayesian_estimator_linearmodel import LinearModel + +try: + from sklearn.pipeline import Pipeline + from sklearn.compose import TransformedTargetRegressor + from sklearn.preprocessing import StandardScaler + sklearn_available = True +except ImportError: + sklearn_available = False + + + +@pytest.fixture(scope="module") +def sample_input(): + x, y = LinearModel.create_sample_input() + return x, y + + +@pytest.fixture(scope="module") +def fitted_linear_model_instance(sample_input): + sampler_config={ + "draws": 500, + "tune": 300, + "chains": 2, + "target_accept": 0.95, + } + x, y = sample_input + model = LinearModel() + model.fit(x, y) + return model + + +def test_save_without_fit_raises_runtime_error(): + x, y = LinearModel.create_sample_input() + test_model = LinearModel() + with pytest.raises(RuntimeError): + test_model.save("saved_model") + + +def test_fit(fitted_linear_model_instance): + model = fitted_linear_model_instance + + x_pred = np.random.uniform(low=0, high=1, size=(100, 1)) + pred = model.predict(x_pred) + assert len(x_pred) == len(pred) + assert isinstance(pred, np.ndarray) + post_pred = model.predict_posterior(x_pred) + assert len(x_pred) == len(post_pred) + assert isinstance(post_pred, xr.DataArray) + + +@pytest.mark.skipif( + sys.platform == "win32", reason="Permissions for temp files not granted on windows CI." +) +def test_save_load(fitted_linear_model_instance): + model = fitted_linear_model_instance + temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) + model.save(temp.name) + model2 = LinearModel.load(temp.name) + assert model.idata.groups() == model2.idata.groups() + + x_pred = np.random.uniform(low=0, high=1, size=(100, 1)) + pred1 = model.predict(x_pred) + pred2 = model2.predict(x_pred) + # Predictions should have similar statistical characteristics + assert pred1.mean() == pytest.approx(pred2.mean(), 1e-3) + assert pred1.var() == pytest.approx(pred2.var(), 1e-2) + temp.close() + + +def test_predict(fitted_linear_model_instance): + model = fitted_linear_model_instance + x_pred = np.random.uniform(low=0, high=1, size=(100, 1)) + pred = model.predict(x_pred) + assert len(x_pred) == len(pred) + assert np.issubdtype(pred.dtype, np.floating) + + +@pytest.mark.parametrize("combined", [True, False]) +def test_predict_posterior(fitted_linear_model_instance, combined): + model = fitted_linear_model_instance + n_pred = 150 + x_pred = np.random.uniform(low=0, high=1, size=(n_pred, 1)) + pred = model.predict_posterior(x_pred, combined=combined) + chains = model.idata.sample_stats.dims["chain"] + draws = model.idata.sample_stats.dims["draw"] + expected_shape = (n_pred, chains * draws) if combined else (chains, draws, n_pred) + assert pred.shape == expected_shape + assert np.issubdtype(pred.dtype, np.floating) + # TODO: check that extend_idata has the expected effect + + +@pytest.mark.parametrize("samples", [None, 300]) +@pytest.mark.parametrize("combined", [True, False]) +def test_sample_prior_predictive(samples, combined, sample_input): + x, y = sample_input + model = LinearModel() + prior_pred = model.sample_prior_predictive(x, samples, combined=combined)[model.output_var] + draws = model.sampler_config['draws'] if samples is None else samples + chains = 1 + expected_shape = (len(x), chains * draws) if combined else (chains, draws, len(x)) + assert prior_pred.shape == expected_shape + # TODO: check that extend_idata has 
the expected effect + + +def test_id(sample_input): + x, y = sample_input + model_config = { + "intercept": {"loc": 0, "scale": 10}, + "slope": {"loc": 0, "scale": 10}, + "obs_error": 2, + } + sampler_config={ + "draws": 1_000, + "tune": 1_000, + "chains": 3, + "target_accept": 0.95, + } + model = LinearModel(model_config=model_config, sampler_config=sampler_config) + + expected_id = hashlib.sha256( + str(model_config.values()).encode() + model.version.encode() + model._model_type.encode() + ).hexdigest()[:16] + + assert model.id == expected_id + + +@pytest.mark.skipif( + not sklearn_available, reason="scikit-learn package is not available." +) +def test_pipeline_integration(sample_input): + model_config = { + "intercept": {"loc": 0, "scale": 2}, + "slope": {"loc": 0, "scale": 2}, + "obs_error": 1, + "default_output_var": "y_hat", + } + model = Pipeline([ + ('input_scaling', StandardScaler()), + ('linear_model', TransformedTargetRegressor(LinearModel(model_config), + transformer=StandardScaler()),) + ]) + x, y = sample_input + model.fit(x, y) + + x_pred = np.random.uniform(low=0, high=1, size=(100, 1)) + model.predict(x_pred) From 8156ad41f98b075c0979fc1691b7786321b76b80 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Sun, 30 Apr 2023 19:51:23 +0300 Subject: [PATCH 2/5] Run pre-commit checks and eliminate mutable default args to LinearModel.__init__ --- .../bayesian_estimator_linearmodel.py | 115 +++++++++--------- .../test_bayesian_estimator_linearmodel.py | 33 ++--- 2 files changed, 75 insertions(+), 73 deletions(-) diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py index 6c85130fc..fdcd8e8dd 100644 --- a/pymc_experimental/bayesian_estimator_linearmodel.py +++ b/pymc_experimental/bayesian_estimator_linearmodel.py @@ -10,13 +10,14 @@ import xarray as xr from pymc.util import RandomState -from .model_builder import ModelBuilder +from pymc_experimental.model_builder import ModelBuilder # If scikit-learn is available, use its data validator try: - from sklearn.utils.validation import check_X_y, check_array + from sklearn.utils.validation import check_array, check_X_y # If scikit-learn is not available, return the data unchanged except ImportError: + def check_X_y(X, y, **kwargs): return X, y @@ -24,7 +25,6 @@ def check_array(X, **kwargs): return X - class BayesianEstimator(ModelBuilder): """ Base class similar to ModelBuilder but customized for integration with a scikit-learn workflow. @@ -54,6 +54,7 @@ class BayesianEstimator(ModelBuilder): The `sklearn` package is not a dependency for using this class, although if imports from `sklearn` are successful, then scikit-learn's data validation functions are used to accept "array-like" input. """ + model_type = "BaseClass" version = "None" @@ -75,9 +76,7 @@ def __init__( self.model_config = model_config # parameters for priors etc. self.sampler_config = sampler_config # parameters for sampling self.model = None - self.idata = ( - None # inference data object placeholder, idata is generated during fitting - ) + self.idata = None # inference data object placeholder, idata is generated during fitting self.is_fitted_ = False def _validate_data(self, X, y=None): @@ -118,7 +117,6 @@ def _data_setter(self, X, y=None): """ raise NotImplementedError - def fit( self, @@ -127,12 +125,12 @@ def fit( progressbar: bool = True, random_seed: RandomState = None, **kwargs: Any, - ) -> 'BayesianEstimator': + ) -> "BayesianEstimator": """ Fit a model using the data passed as a parameter. 
Sets attrs to inference data of the model. - + Parameters ---------- X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features) @@ -145,7 +143,7 @@ def fit( Provides sampler with initial random seed for obtaining reproducible samples **kwargs : Any Custom sampler settings can be provided in form of keyword arguments. - + Returns ------- self : object @@ -186,17 +184,23 @@ def predict( y_pred : ndarray, shape (n_pred,) Predicted output corresponding to input X_pred. """ - if not hasattr(self, 'output_var'): - raise NotImplementedError(f'Subclasses of {__class__} should set self.output_var') - + if not hasattr(self, "output_var"): + raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var") + X_pred = self._validate_data(X_pred) - posterior_predictive_samples = self.sample_posterior_predictive(X_pred, extend_idata, combined=False) + posterior_predictive_samples = self.sample_posterior_predictive( + X_pred, extend_idata, combined=False + ) if self.output_var not in posterior_predictive_samples: - raise KeyError(f'Output variable {self.output_var} not found in posterior predictive samples.') + raise KeyError( + f"Output variable {self.output_var} not found in posterior predictive samples." + ) - posterior_means = posterior_predictive_samples[self.output_var].mean(dim=["chain", "draw"], keep_attrs=False) + posterior_means = posterior_predictive_samples[self.output_var].mean( + dim=["chain", "draw"], keep_attrs=False + ) return posterior_means.data def predict_posterior( @@ -222,31 +226,32 @@ def predict_posterior( y_pred : DataArray, shape (n_pred, chains * draws) if combined is True, otherwise (chains, draws, n_pred) Posterior predictive samples for each input X_pred """ - if not hasattr(self, 'output_var'): - raise NotImplementedError(f'Subclasses of {__class__} should set self.output_var') + if not hasattr(self, "output_var"): + raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var") X_pred = self._validate_data(X_pred) - posterior_predictive_samples = self.sample_posterior_predictive(X_pred, extend_idata, combined) + posterior_predictive_samples = self.sample_posterior_predictive( + X_pred, extend_idata, combined + ) if self.output_var not in posterior_predictive_samples: - raise KeyError(f'Output variable {self.output_var} not found in posterior predictive samples.') + raise KeyError( + f"Output variable {self.output_var} not found in posterior predictive samples." + ) return posterior_predictive_samples[self.output_var] def predict_proba( - self, - X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], - extend_idata: bool = True, - combined: bool = False, + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + combined: bool = False, ) -> xr.DataArray: - """ Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators. """ + """Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators.""" return self.predict_posterior(self, X_pred, extend_idata, combined) - def sample_prior_predictive(self, - X_pred, - samples: int = None, - extend_idata: bool = False, - combined: bool = True + def sample_prior_predictive( + self, X_pred, samples: int = None, extend_idata: bool = False, combined: bool = True ): """ Sample from the model's prior predictive distribution. 
@@ -269,7 +274,7 @@ def sample_prior_predictive(self, Prior predictive samples for each input X_pred """ if samples is None: - samples = self.sampler_config.get('draws', 500) + samples = self.sampler_config.get("draws", 500) if self.model is None: self.build_model() @@ -285,13 +290,7 @@ def sample_prior_predictive(self, else: self.idata = prior_pred - prior_predictive_samples = az.extract( - prior_pred, "prior_predictive", combined=combined - ) - print('prior_pred = ') - print(prior_pred) - print('prior_predictive_samples = ') - print(prior_predictive_samples) + prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined) return prior_predictive_samples @@ -323,7 +322,7 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined): posterior_predictive_samples = az.extract( post_pred, "posterior_predictive", combined=combined ) - + return posterior_predictive_samples @classmethod @@ -372,15 +371,14 @@ def get_params(self, deep=True): """ Get all the model parameters needed to instantiate a copy of the model, not including training data. """ - return {'model_config': self.model_config, - 'sampler_config': self.sampler_config} + return {"model_config": self.model_config, "sampler_config": self.sampler_config} def set_params(self, **params): """ Set all the model parameters needed to instantiate the model, not including training data. """ - self.model_config = params['model_config'] - self.sampler_config = params['sampler_config'] + self.model_config = params["model_config"] + self.sampler_config = params["sampler_config"] class LinearModel(BayesianEstimator): @@ -388,26 +386,31 @@ class LinearModel(BayesianEstimator): This class is an implementation of a single-input linear regression model in PYMC using the BayesianEstimator base class for interoperability with scikit-learn. 
""" + _model_type = "LinearModel" version = "0.1" def __init__( - self, - model_config={ + self, + model_config: Dict = None, + sampler_config: Dict = None, + ): + if model_config is None: + model_config = { "intercept": {"loc": 0, "scale": 10}, "slope": {"loc": 0, "scale": 10}, "obs_error": 2, "default_output_var": "y_hat", - }, - sampler_config={ + } + if sampler_config is None: + sampler_config = { "draws": 1_000, "tune": 1_000, "chains": 3, "target_accept": 0.95, - }, - ): + } super().__init__(model_config, sampler_config) - self.output_var = 'y_hat' + self.output_var = "y_hat" def build_model(self): cfg = self.model_config @@ -420,16 +423,14 @@ def build_model(self): y_data = pm.MutableData("y_data", np.zeros((1,))) # priors - intercept = pm.Normal("intercept", - cfg["intercept"]["loc"], - sigma=cfg["intercept"]["scale"]) - slope = pm.Normal("slope", - cfg["slope"]["loc"], - sigma=cfg["slope"]["scale"]) + intercept = pm.Normal( + "intercept", cfg["intercept"]["loc"], sigma=cfg["intercept"]["scale"] + ) + slope = pm.Normal("slope", cfg["slope"]["loc"], sigma=cfg["slope"]["scale"]) obs_error = pm.HalfNormal("σ_model_fmc", cfg["obs_error"]) # Model - y_model = pm.Deterministic('y_model', intercept + slope * x) + y_model = pm.Deterministic("y_model", intercept + slope * x) # observed data y_hat = pm.Normal("y_hat", y_model + obs_error, shape=x.shape, observed=y_data) diff --git a/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py b/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py index 7bd4c167f..9fc9b5754 100644 --- a/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py +++ b/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py @@ -1,4 +1,3 @@ - # Copyright 2023 The PyMC Developers # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,21 +17,21 @@ import tempfile import numpy as np -import xarray as xr import pytest +import xarray as xr from pymc_experimental.bayesian_estimator_linearmodel import LinearModel try: - from sklearn.pipeline import Pipeline from sklearn.compose import TransformedTargetRegressor + from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler + sklearn_available = True except ImportError: sklearn_available = False - @pytest.fixture(scope="module") def sample_input(): x, y = LinearModel.create_sample_input() @@ -41,7 +40,7 @@ def sample_input(): @pytest.fixture(scope="module") def fitted_linear_model_instance(sample_input): - sampler_config={ + sampler_config = { "draws": 500, "tune": 300, "chains": 2, @@ -119,12 +118,12 @@ def test_sample_prior_predictive(samples, combined, sample_input): x, y = sample_input model = LinearModel() prior_pred = model.sample_prior_predictive(x, samples, combined=combined)[model.output_var] - draws = model.sampler_config['draws'] if samples is None else samples + draws = model.sampler_config["draws"] if samples is None else samples chains = 1 expected_shape = (len(x), chains * draws) if combined else (chains, draws, len(x)) assert prior_pred.shape == expected_shape # TODO: check that extend_idata has the expected effect - + def test_id(sample_input): x, y = sample_input @@ -133,7 +132,7 @@ def test_id(sample_input): "slope": {"loc": 0, "scale": 10}, "obs_error": 2, } - sampler_config={ + sampler_config = { "draws": 1_000, "tune": 1_000, "chains": 3, @@ -148,9 +147,7 @@ def test_id(sample_input): assert model.id == expected_id -@pytest.mark.skipif( - not sklearn_available, reason="scikit-learn package is not available." 
-) +@pytest.mark.skipif(not sklearn_available, reason="scikit-learn package is not available.") def test_pipeline_integration(sample_input): model_config = { "intercept": {"loc": 0, "scale": 2}, @@ -158,11 +155,15 @@ def test_pipeline_integration(sample_input): "obs_error": 1, "default_output_var": "y_hat", } - model = Pipeline([ - ('input_scaling', StandardScaler()), - ('linear_model', TransformedTargetRegressor(LinearModel(model_config), - transformer=StandardScaler()),) - ]) + model = Pipeline( + [ + ("input_scaling", StandardScaler()), + ( + "linear_model", + TransformedTargetRegressor(LinearModel(model_config), transformer=StandardScaler()), + ), + ] + ) x, y = sample_input model.fit(x, y) From abc9d1995ba516adbbf289baef756c882e9467fd Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Mon, 1 May 2023 12:50:53 +0300 Subject: [PATCH 3/5] Move default constructor parameters to properties This avoids issues with mutable objects as default parameters. It allows users to start from the default parameters and modify parts of them before passing them to __init__. --- .../bayesian_estimator_linearmodel.py | 67 +++++++++++-------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py index fdcd8e8dd..40d0db1fd 100644 --- a/pymc_experimental/bayesian_estimator_linearmodel.py +++ b/pymc_experimental/bayesian_estimator_linearmodel.py @@ -60,8 +60,8 @@ class BayesianEstimator(ModelBuilder): def __init__( self, - model_config: Dict, - sampler_config: Dict, + model_config: Dict = None, + sampler_config: Dict = None, ): """ Initializes model configuration and sampler configuration for the model @@ -73,12 +73,29 @@ def __init__( sampler_config : dict Parameters that initialise sampler configuration """ - self.model_config = model_config # parameters for priors etc. 
- self.sampler_config = sampler_config # parameters for sampling - self.model = None - self.idata = None # inference data object placeholder, idata is generated during fitting + if model_config is None: + model_config = self.default_model_config + self.model_config = model_config + + if sampler_config is None: + sampler_config = self.default_sampler_config + self.sampler_config = sampler_config + + self.model = None # Set by build_model + self.output_var = None # Set by build_model + self.idata = None # idata is generated during fitting self.is_fitted_ = False + @property + @abstractmethod + def default_model_config(self): + raise NotImplementedError + + @property + @abstractmethod + def default_sampler_config(self): + raise NotImplementedError + def _validate_data(self, X, y=None): if y is not None: return check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=False) @@ -390,27 +407,22 @@ class LinearModel(BayesianEstimator): _model_type = "LinearModel" version = "0.1" - def __init__( - self, - model_config: Dict = None, - sampler_config: Dict = None, - ): - if model_config is None: - model_config = { - "intercept": {"loc": 0, "scale": 10}, - "slope": {"loc": 0, "scale": 10}, - "obs_error": 2, - "default_output_var": "y_hat", - } - if sampler_config is None: - sampler_config = { - "draws": 1_000, - "tune": 1_000, - "chains": 3, - "target_accept": 0.95, - } - super().__init__(model_config, sampler_config) - self.output_var = "y_hat" + @property + def default_model_config(self): + return { + "intercept": {"loc": 0, "scale": 10}, + "slope": {"loc": 0, "scale": 10}, + "obs_error": 2, + } + + @property + def default_sampler_config(self): + return { + "draws": 1_000, + "tune": 1_000, + "chains": 3, + "target_accept": 0.95, + } def build_model(self): cfg = self.model_config @@ -434,6 +446,7 @@ def build_model(self): # observed data y_hat = pm.Normal("y_hat", y_model + obs_error, shape=x.shape, observed=y_data) + self.output_var = "y_hat" def _data_setter(self, X, y=None): with self.model: From bfe21c21c38f5c0afe2644fd9b9fc84619384f97 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Fri, 5 May 2023 23:53:39 +0300 Subject: [PATCH 4/5] Add dimension names to model and fix bug in predict_proba --- pymc_experimental/bayesian_estimator_linearmodel.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py index 40d0db1fd..39ab5e7af 100644 --- a/pymc_experimental/bayesian_estimator_linearmodel.py +++ b/pymc_experimental/bayesian_estimator_linearmodel.py @@ -265,7 +265,7 @@ def predict_proba( combined: bool = False, ) -> xr.DataArray: """Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators.""" - return self.predict_posterior(self, X_pred, extend_idata, combined) + return self.predict_posterior(X_pred, extend_idata, combined) def sample_prior_predictive( self, X_pred, samples: int = None, extend_idata: bool = False, combined: bool = True @@ -431,8 +431,8 @@ def build_model(self): # Actual data will be set by _data_setter when fitting or evaluating the model. # Data array size can change but number of dimensions must stay the same. 
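+        # The "observation" dim added in this patch gives the data containers
+        # and predictions a shared named coordinate in the InferenceData.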
with pm.Model() as self.model: - x = pm.MutableData("x", np.zeros((1,))) - y_data = pm.MutableData("y_data", np.zeros((1,))) + x = pm.MutableData("x", np.zeros((1,)), dims='observation') + y_data = pm.MutableData("y_data", np.zeros((1,)), dims='observation') # priors intercept = pm.Normal( @@ -442,15 +442,15 @@ def build_model(self): obs_error = pm.HalfNormal("σ_model_fmc", cfg["obs_error"]) # Model - y_model = pm.Deterministic("y_model", intercept + slope * x) + y_model = pm.Deterministic("y_model", intercept + slope * x, dims='observation') # observed data - y_hat = pm.Normal("y_hat", y_model + obs_error, shape=x.shape, observed=y_data) + y_hat = pm.Normal("y_hat", y_model, sigma=obs_error, shape=x.shape, observed=y_data, dims='observation') self.output_var = "y_hat" def _data_setter(self, X, y=None): with self.model: - pm.set_data({"x": X.squeeze()}) + pm.set_data({"x": X[:, 0]}) if y is not None: pm.set_data({"y_data": y.squeeze()}) From 235876c389c76fce3ece21134273d4d4b2b80a35 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Fri, 5 May 2023 23:56:35 +0300 Subject: [PATCH 5/5] black reformatting --- .../bayesian_estimator_linearmodel.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py index 39ab5e7af..4feaf2440 100644 --- a/pymc_experimental/bayesian_estimator_linearmodel.py +++ b/pymc_experimental/bayesian_estimator_linearmodel.py @@ -431,8 +431,8 @@ def build_model(self): # Actual data will be set by _data_setter when fitting or evaluating the model. # Data array size can change but number of dimensions must stay the same. with pm.Model() as self.model: - x = pm.MutableData("x", np.zeros((1,)), dims='observation') - y_data = pm.MutableData("y_data", np.zeros((1,)), dims='observation') + x = pm.MutableData("x", np.zeros((1,)), dims="observation") + y_data = pm.MutableData("y_data", np.zeros((1,)), dims="observation") # priors intercept = pm.Normal( @@ -442,10 +442,17 @@ def build_model(self): obs_error = pm.HalfNormal("σ_model_fmc", cfg["obs_error"]) # Model - y_model = pm.Deterministic("y_model", intercept + slope * x, dims='observation') + y_model = pm.Deterministic("y_model", intercept + slope * x, dims="observation") # observed data - y_hat = pm.Normal("y_hat", y_model, sigma=obs_error, shape=x.shape, observed=y_data, dims='observation') + y_hat = pm.Normal( + "y_hat", + y_model, + sigma=obs_error, + shape=x.shape, + observed=y_data, + dims="observation", + ) self.output_var = "y_hat" def _data_setter(self, X, y=None):
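
Applied in order, the five patches leave `LinearModel` usable either standalone or inside a scikit-learn pipeline. A minimal end-to-end sketch of the resulting API (the override value and data sizes are illustrative; sampling uses `default_sampler_config`):

    from sklearn.compose import TransformedTargetRegressor
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    from pymc_experimental.bayesian_estimator_linearmodel import LinearModel

    X, y = LinearModel.create_sample_input(nsamples=100)

    # PATCH 3 exposes the defaults as properties, so users can start from them
    # and override selectively before construction.
    config = LinearModel().default_model_config
    config["slope"]["scale"] = 5
    model = LinearModel(model_config=config)

    # Standalone use: predict() returns posterior means; predict_posterior()
    # (and its alias predict_proba, fixed in PATCH 4) returns full draws.
    model.fit(X, y)
    y_point = model.predict(X)            # ndarray, shape (100,)
    y_draws = model.predict_posterior(X)  # xr.DataArray, (100, chains * draws)

    # scikit-learn interoperability, as covered by test_pipeline_integration.
    pipeline = Pipeline(
        [
            ("input_scaling", StandardScaler()),
            (
                "linear_model",
                TransformedTargetRegressor(LinearModel(), transformer=StandardScaler()),
            ),
        ]
    )
    pipeline.fit(X, y)
    y_pred = pipeline.predict(X)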