From 74eda3e6353c02a05de26d4f0e77d97e87f8b492 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Tue, 13 Jun 2023 19:09:16 +0300 Subject: [PATCH 1/4] First pass at refactoring ModelBuilder.fit Added method-specific inference functions to ModelBuilder and made fit call them based on a fit_method parameter. Tests coming in the next commit. Also need to try to factor out code common to both methods. --- pymc_experimental/model_builder.py | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index c3550712..a0b6529a 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -24,6 +24,8 @@ import pandas as pd import pymc as pm import xarray as xr +from pymc.backends import NDArray +from pymc.backends.base import MultiTrace from pymc.util import RandomState # If scikit-learn is available, use its data validator @@ -422,6 +424,36 @@ def load(cls, fname: str): return model def fit( + self, + X: pd.DataFrame, + y: Optional[pd.Series] = None, + fit_method="mcmc", + **kwargs: Any, + ) -> az.InferenceData: + """ + Fit a model using the data passed as a parameter. + Sets attrs to inference data of the model. + + + Parameters + ---------- + X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features) + The training input samples. + y : array-like if sklearn is available, otherwise array, shape (n_obs,) + The target values (real numbers). + fit_method : str + Which method to use to infer model parameters. One of ["mcmc", "MAP"]. + **kwargs : Any + Parameters to pass to the inference method. See `_fit_mcmc` or `_fit_MAP` for + method-specific parameters. + """ + if fit_method == "mcmc": + return self._fit_mcmc(X, y, **kwargs) + if fit_method == "MAP": + return self._fit_MAP(X, y, **kwargs) + raise ValueError(f"Inference method {fit_method} not found. Choose one of ['mcmc', 'MAP'].") + + def _fit_mcmc( self, X: pd.DataFrame, y: Optional[pd.Series] = None, @@ -482,6 +514,44 @@ def fit( self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore return self.idata # type: ignore + def _fit_MAP( + self, + X: pd.DataFrame, + y: Optional[pd.Series] = None, + predictor_names: List[str] = None, + **kwargs, + ): + """Find model maximum a posteriori using scipy optimizer""" + + if predictor_names is None: + predictor_names = [] + if y is None: + y = np.zeros(X.shape[0]) + y = pd.DataFrame({self.output_var: y}) + self.generate_and_preprocess_model_data(X, y.values.flatten()) + self.build_model(self.X, self.y) + model = self.model + + map_res = pm.find_MAP(model=model, **kwargs) + # Filter non-value variables + value_vars_names = {v.name for v in model.value_vars} + map_res = {k: v for k, v in map_res.items() if k in value_vars_names} + + # Convert map result to InferenceData + map_strace = NDArray(model=model) + map_strace.setup(draws=1, chain=0) + map_strace.record(map_res) + map_strace.close() + trace = MultiTrace([map_strace]) + self.idata = pm.to_inference_data(trace, model=model) + self.set_idata_attrs(self.idata) + + X_df = pd.DataFrame(X, columns=X.columns) + combined_data = pd.concat([X_df, y], axis=1) + assert all(combined_data.columns), "All columns must have non-empty names" + self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore + return self.idata + def predict( self, X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], From 36eb7c85b1cf1619651401f1aa68f2914a884764 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Tue, 13 Jun 2023 19:12:22 +0300 Subject: [PATCH 2/4] Add tests for ModelBuilder.fit with different fit_method values Added fit_method as a parameter to the tests so that test models are tested with mcmc and MAP fitting methods. --- pymc_experimental/tests/test_model_builder.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py index dd4a88ab..63406ee6 100644 --- a/pymc_experimental/tests/test_model_builder.py +++ b/pymc_experimental/tests/test_model_builder.py @@ -40,8 +40,8 @@ def toy_y(toy_X): return y -@pytest.fixture(scope="module") -def fitted_model_instance(toy_X, toy_y): +@pytest.fixture(scope="module", params=["mcmc", "MAP"]) +def fitted_model_instance(request, toy_X, toy_y): sampler_config = { "draws": 500, "tune": 300, @@ -54,12 +54,11 @@ def fitted_model_instance(toy_X, toy_y): "obs_error": 2, } model = test_ModelBuilder(model_config=model_config, sampler_config=sampler_config) - model.fit(toy_X) + model.fit(toy_X, toy_y, fit_method=request.param) return model class test_ModelBuilder(ModelBuilder): - _model_type = "LinearModel" version = "0.1" @@ -151,9 +150,10 @@ def test_fit(fitted_model_instance): post_pred[fitted_model_instance.output_var].shape[0] == prediction_data.input.shape -def test_fit_no_y(toy_X): +@pytest.mark.parametrize("fit_method", ["mcmc", "MAP"]) +def test_fit_no_y(toy_X, fit_method): model_builder = test_ModelBuilder() - model_builder.idata = model_builder.fit(X=toy_X) + model_builder.idata = model_builder.fit(X=toy_X, fit_method=fit_method) assert model_builder.model is not None assert model_builder.idata is not None assert "posterior" in model_builder.idata.groups() @@ -166,7 +166,7 @@ def test_save_load(fitted_model_instance): temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) fitted_model_instance.save(temp.name) test_builder2 = test_ModelBuilder.load(temp.name) - assert fitted_model_instance.idata.groups() == test_builder2.idata.groups() + assert sorted(fitted_model_instance.idata.groups()) == sorted(test_builder2.idata.groups()) x_pred = np.random.uniform(low=0, high=1, size=100) prediction_data = pd.DataFrame({"input": x_pred}) @@ -193,8 +193,8 @@ def test_sample_posterior_predictive(fitted_model_instance, combined): pred = fitted_model_instance.sample_posterior_predictive( prediction_data["input"], combined=combined, extend_idata=True ) - chains = fitted_model_instance.idata.sample_stats.dims["chain"] - draws = fitted_model_instance.idata.sample_stats.dims["draw"] + chains = fitted_model_instance.idata.posterior.dims["chain"] + draws = fitted_model_instance.idata.posterior.dims["draw"] expected_shape = (n_pred, chains * draws) if combined else (chains, draws, n_pred) assert pred[fitted_model_instance.output_var].shape == expected_shape assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating) From e2b203beea6cc530a341e76f1a663ef34872e598 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Tue, 13 Jun 2023 19:16:51 +0300 Subject: [PATCH 3/4] Use context manager for tempfile When test_save_load failed, pytest complained about the temp file not being closed. It didn't seem to cause any problems, but maybe this is nicer. --- pymc_experimental/tests/test_model_builder.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py index 63406ee6..18bfce7c 100644 --- a/pymc_experimental/tests/test_model_builder.py +++ b/pymc_experimental/tests/test_model_builder.py @@ -163,9 +163,9 @@ def test_fit_no_y(toy_X, fit_method): sys.platform == "win32", reason="Permissions for temp files not granted on windows CI." ) def test_save_load(fitted_model_instance): - temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) - fitted_model_instance.save(temp.name) - test_builder2 = test_ModelBuilder.load(temp.name) + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as temp: + fitted_model_instance.save(temp.name) + test_builder2 = test_ModelBuilder.load(temp.name) assert sorted(fitted_model_instance.idata.groups()) == sorted(test_builder2.idata.groups()) x_pred = np.random.uniform(low=0, high=1, size=100) @@ -173,7 +173,6 @@ def test_save_load(fitted_model_instance): pred1 = fitted_model_instance.predict(prediction_data["input"]) pred2 = test_builder2.predict(prediction_data["input"]) assert pred1.shape == pred2.shape - temp.close() def test_predict(fitted_model_instance): From dfb9dc2b15a7dfb9828480c7836858dbac882fc5 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Tue, 13 Jun 2023 23:25:09 +0300 Subject: [PATCH 4/4] Refactor ModelBuilder.fit again Refactored the ModelBuilder.fit method again to eliminate code redundancy between the two fitting methods. Eliminated extra kwargs from being passed to scipy.optimize.minimize. --- pymc_experimental/model_builder.py | 112 +++++++++++++---------------- 1 file changed, 51 insertions(+), 61 deletions(-) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index a0b6529a..7763a001 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -428,35 +428,6 @@ def fit( X: pd.DataFrame, y: Optional[pd.Series] = None, fit_method="mcmc", - **kwargs: Any, - ) -> az.InferenceData: - """ - Fit a model using the data passed as a parameter. - Sets attrs to inference data of the model. - - - Parameters - ---------- - X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features) - The training input samples. - y : array-like if sklearn is available, otherwise array, shape (n_obs,) - The target values (real numbers). - fit_method : str - Which method to use to infer model parameters. One of ["mcmc", "MAP"]. - **kwargs : Any - Parameters to pass to the inference method. See `_fit_mcmc` or `_fit_MAP` for - method-specific parameters. - """ - if fit_method == "mcmc": - return self._fit_mcmc(X, y, **kwargs) - if fit_method == "MAP": - return self._fit_MAP(X, y, **kwargs) - raise ValueError(f"Inference method {fit_method} not found. Choose one of ['mcmc', 'MAP'].") - - def _fit_mcmc( - self, - X: pd.DataFrame, - y: Optional[pd.Series] = None, progressbar: bool = True, predictor_names: List[str] = None, random_seed: RandomState = None, @@ -473,6 +444,8 @@ def _fit_mcmc( The training input samples. y : array-like if sklearn is available, otherwise array, shape (n_obs,) The target values (real numbers). + fit_method : str + Which method to use to infer model parameters. One of ["mcmc", "MAP"]. progressbar : bool Specifies whether the fit progressbar should be displayed predictor_names: List[str] = None, @@ -481,19 +454,14 @@ def _fit_mcmc( random_seed : RandomState Provides sampler with initial random seed for obtaining reproducible samples **kwargs : Any - Custom sampler settings can be provided in form of keyword arguments. - - Returns - ------- - self : az.InferenceData - returns inference data of the fitted model. - Examples - -------- - >>> model = MyModel() - >>> idata = model.fit(data) - Auto-assigning NUTS sampler... - Initializing NUTS using jitter+adapt_diag... + Parameters to pass to the inference method. See `_fit_mcmc` or `_fit_MAP` for + method-specific parameters. """ + available_methods = ["mcmc", "MAP"] + if fit_method not in available_methods: + raise ValueError( + f"Inference method {fit_method} not found. Choose one of {available_methods}." + ) if predictor_names is None: predictor_names = [] if y is None: @@ -506,7 +474,11 @@ def _fit_mcmc( sampler_config["progressbar"] = progressbar sampler_config["random_seed"] = random_seed sampler_config.update(**kwargs) - self.idata = self.sample_model(**sampler_config) + + if fit_method == "mcmc": + self.idata = self.sample_model(**sampler_config) + elif fit_method == "MAP": + self.idata = self._fit_MAP(**sampler_config) X_df = pd.DataFrame(X, columns=X.columns) combined_data = pd.concat([X_df, y], axis=1) @@ -516,23 +488,45 @@ def _fit_mcmc( def _fit_MAP( self, - X: pd.DataFrame, - y: Optional[pd.Series] = None, - predictor_names: List[str] = None, **kwargs, ): """Find model maximum a posteriori using scipy optimizer""" - if predictor_names is None: - predictor_names = [] - if y is None: - y = np.zeros(X.shape[0]) - y = pd.DataFrame({self.output_var: y}) - self.generate_and_preprocess_model_data(X, y.values.flatten()) - self.build_model(self.X, self.y) model = self.model - - map_res = pm.find_MAP(model=model, **kwargs) + find_MAP_args = {**self.sampler_config, **kwargs} + if "random_seed" in find_MAP_args: + # find_MAP takes a different argument name for seed than sample_* do. + find_MAP_args["seed"] = find_MAP_args["random_seed"] + # Extra unknown arguments cause problems for SciPy minimize + allowed_args = [ # find_MAP args + "start", + "vars", + "method", + # "return_raw", # probably causes a problem if set spuriously + # "include_transformed", # probably causes a problem if set spuriously + "progressbar", + "maxeval", + "seed", + ] + allowed_args += [ # scipy.optimize.minimize args + # "fun", # used by find_MAP + # "x0", # used by find_MAP + "args", + "method", + # "jac", # used by find_MAP + # "hess", # probably causes a problem if set spuriously + # "hessp", # probably causes a problem if set spuriously + "bounds", + "constraints", + "tol", + "callback", + "options", + ] + for arg in list(find_MAP_args): + if arg not in allowed_args: + del find_MAP_args[arg] + + map_res = pm.find_MAP(model=model, **find_MAP_args) # Filter non-value variables value_vars_names = {v.name for v in model.value_vars} map_res = {k: v for k, v in map_res.items() if k in value_vars_names} @@ -543,14 +537,10 @@ def _fit_MAP( map_strace.record(map_res) map_strace.close() trace = MultiTrace([map_strace]) - self.idata = pm.to_inference_data(trace, model=model) - self.set_idata_attrs(self.idata) + idata = pm.to_inference_data(trace, model=model) + self.set_idata_attrs(idata) - X_df = pd.DataFrame(X, columns=X.columns) - combined_data = pd.concat([X_df, y], axis=1) - assert all(combined_data.columns), "All columns must have non-empty names" - self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore - return self.idata + return idata def predict( self,