From d0a1f512b1b280d1e1598eb4f0f8ed06707c6fa4 Mon Sep 17 00:00:00 2001
From: Ben Mares <services-git-throwaway1@tensorial.com>
Date: Wed, 23 Aug 2023 15:38:32 +0200
Subject: [PATCH 1/6] Fix Stucchio URL

The backslashes are appearing in the actual URL
---
 examples/references.bib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/references.bib b/examples/references.bib
index f28c8be17..08f6a60d5 100644
--- a/examples/references.bib
+++ b/examples/references.bib
@@ -648,7 +648,7 @@ @online{stucchio2015bayesian
   title         = {Bayesian A/B Testing at VWO},
   author        = {Stucchio, Chris},
   year          = {2015},
-  url           = {https://vwo.com/downloads/VWO\_SmartStats\_technical\_whitepaper.pdf}
+  url           = {https://vwo.com/downloads/VWO_SmartStats_technical_whitepaper.pdf}
 }
 @misc{szegedy2014going,
   title         = {Going Deeper with Convolutions},

From 421beefa67c150004dc356ad702a9a3356773904 Mon Sep 17 00:00:00 2001
From: Ben Mares <services-git-throwaway1@tensorial.com>
Date: Fri, 1 Sep 2023 05:59:42 +0200
Subject: [PATCH 2/6] Update bibtex-tidy

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ee175b97b..982f05f10 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -32,7 +32,7 @@ repos:
             |examples/samplers/MLDA_variance_reduction_linear_regression\.ipynb$
 
 - repo: https://github.com/FlamingTempura/bibtex-tidy
-  rev: v1.8.5
+  rev: v1.11.0
   hooks:
     - id: bibtex-tidy
       files: examples/references.bib

From 0a0c6f1703691999d87b2db4264eb75c2226aa6f Mon Sep 17 00:00:00 2001
From: Ben Mares <services-git-throwaway1@tensorial.com>
Date: Fri, 1 Sep 2023 06:01:15 +0200
Subject: [PATCH 3/6] Fix Padonou URL

---
 examples/references.bib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/references.bib b/examples/references.bib
index 08f6a60d5..92e518656 100644
--- a/examples/references.bib
+++ b/examples/references.bib
@@ -555,7 +555,7 @@ @unpublished{padonou2015polar
   note          = {working paper or preprint},
   year          = {2015},
   month         = Feb,
-  pdf           = {https://hal.archives-ouvertes.fr/hal-01119942v1/file/PolarGP\_CircularDomains.pdf}
+  pdf           = {https://hal.archives-ouvertes.fr/hal-01119942v1/file/PolarGP_CircularDomains.pdf}
 }
 @book{pearl2000causality,
   title         = {Causality: Models, reasoning and inference},

From 529a46315814a071c11b020486841c9188c5eb0f Mon Sep 17 00:00:00 2001
From: Ben Mares <services-git-throwaway1@tensorial.com>
Date: Fri, 1 Sep 2023 06:27:20 +0200
Subject: [PATCH 4/6] =?UTF-8?q?Increase=20Node=20version=2015=E2=86=9218?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/pre-commit.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index ad7c70940..a373b9d96 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -15,5 +15,5 @@ jobs:
     - uses: actions/setup-python@v2
     - uses: actions/setup-node@v2
       with:
-        node-version: '15'
+        node-version: '18'
     - uses: pre-commit/action@v2.0.0

From 5b6b7cd6fb63e0fc88c0ef6d36b1f212100afb5c Mon Sep 17 00:00:00 2001
From: Ben Mares <services-git-throwaway1@tensorial.com>
Date: Fri, 1 Sep 2023 06:33:32 +0200
Subject: [PATCH 5/6] Run pre-commit autoupdate

---
 .pre-commit-config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 982f05f10..ca8121c68 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,10 +1,10 @@
 repos:
 - repo: https://github.com/psf/black
-  rev: 22.3.0
+  rev: 23.7.0
   hooks:
     - id: black-jupyter
 - repo: https://github.com/nbQA-dev/nbQA
-  rev: 1.1.0
+  rev: 1.7.0
   hooks:
     - id: nbqa-isort
       additional_dependencies: [isort==5.6.4]
@@ -12,7 +12,7 @@ repos:
       additional_dependencies: [pyupgrade==2.7.4]
       args: [--py37-plus]
 - repo: https://github.com/MarcoGorelli/madforhooks
-  rev: 0.3.0
+  rev: 0.4.1
   hooks:
     - id: check-execution-order
       args: [--strict]
@@ -96,7 +96,7 @@ repos:
       language: pygrep
       types_or: [markdown, rst, jupyter]
 - repo: https://github.com/mwouts/jupytext
-  rev: v1.13.7
+  rev: v1.15.1
   hooks:
   - id: jupytext
     files: ^examples/.+\.ipynb$

From 5b71c0cd242eb865457f814364a9aebaac496a31 Mon Sep 17 00:00:00 2001
From: Ben Mares <services-git-throwaway1@tensorial.com>
Date: Fri, 1 Sep 2023 06:34:19 +0200
Subject: [PATCH 6/6] Run pre-commit on all files

---
 examples/case_studies/GEV.myst.md             |   6 +-
 .../Missing_Data_Imputation.ipynb             |   2 -
 .../Missing_Data_Imputation.myst.md           |   2 -
 .../bart_heteroscedasticity.myst.md           |   6 --
 examples/case_studies/binning.myst.md         |  40 -------
 .../blackbox_external_likelihood.ipynb        |   1 -
 .../blackbox_external_likelihood.myst.md      |   1 -
 .../blackbox_external_likelihood_numpy.ipynb  |   1 -
 ...blackbox_external_likelihood_numpy.myst.md |   1 -
 .../conditional_autoregressive_priors.myst.md |   4 -
 examples/case_studies/disaster_model.py       |   1 -
 .../case_studies/disaster_model_theano_op.py  |   1 -
 examples/case_studies/gelman_bioassay.py      |   1 -
 examples/case_studies/gelman_schools.py       |   1 -
 .../hierarchical_partial_pooling.ipynb        |   2 -
 .../hierarchical_partial_pooling.myst.md      |   2 -
 examples/case_studies/item_response_nba.ipynb |   3 +-
 .../case_studies/item_response_nba.myst.md    |  21 +---
 .../case_studies/longitudinal_models.myst.md  |  30 ------
 .../case_studies/multilevel_modeling.ipynb    |   4 -
 .../case_studies/multilevel_modeling.myst.md  |   4 -
 ...eliability_and_calibrated_prediction.ipynb |   2 -
 ...iability_and_calibrated_prediction.myst.md |   2 -
 examples/case_studies/spline.myst.md          |   4 +-
 examples/causal_inference/excess_deaths.ipynb |   1 -
 .../causal_inference/excess_deaths.myst.md    |   3 +-
 .../interrupted_time_series.myst.md           |   2 -
 .../interventional_distribution.myst.md       |  61 +++++------
 .../regression_discontinuity.myst.md          |   6 +-
 .../model_averaging.myst.md                   |  35 ++----
 .../GP-MeansAndCovs.myst.md                   | 101 ++++++------------
 .../gaussian_processes/GP-smoothing.ipynb     |   1 -
 .../gaussian_processes/GP-smoothing.myst.md   |   1 -
 .../MOGP-Coregion-Hadamard.myst.md            |   4 -
 .../gaussian_processes/gaussian_process.ipynb |   1 -
 .../gaussian_process.myst.md                  |   1 -
 .../GLM-binomial-regression.myst.md           |   6 --
 .../GLM-hierarchical-binomial-model.ipynb     |   3 -
 .../GLM-hierarchical-binomial-model.myst.md   |   3 -
 .../GLM-model-selection.ipynb                 |   1 -
 .../GLM-model-selection.myst.md               |   1 -
 .../GLM-ordinal-regression.ipynb              |   2 -
 .../GLM-ordinal-regression.myst.md            |  74 -------------
 .../GLM-poisson-regression.ipynb              |   1 -
 .../GLM-poisson-regression.myst.md            |  74 +++++--------
 .../GLM-robust-with-outlier-detection.ipynb   |   3 -
 .../GLM-robust-with-outlier-detection.myst.md |   3 -
 .../GLM-truncated-censored-regression.myst.md |  36 -------
 examples/howto/LKJ_correlation.py             |   1 -
 examples/howto/custom_dists.py                |   1 +
 examples/howto/lasso_block_update.myst.md     |   8 --
 examples/howto/model_builder.myst.md          |   4 -
 examples/howto/sampling_callback.ipynb        |   1 -
 examples/howto/sampling_callback.myst.md      |   1 -
 examples/howto/updating_priors.ipynb          |   2 -
 examples/howto/updating_priors.myst.md        |   2 -
 ...arginalized_gaussian_mixture_model.myst.md |  18 +---
 .../ODE_Lotka_Volterra_multiple_ways.ipynb    |   3 -
 .../ODE_Lotka_Volterra_multiple_ways.myst.md  |  19 ++--
 .../ODE_with_manual_gradients.ipynb           |   4 -
 .../ODE_with_manual_gradients.myst.md         |   4 -
 .../DEMetropolisZ_EfficiencyComparison.ipynb  |   2 -
 ...DEMetropolisZ_EfficiencyComparison.myst.md |   2 -
 .../DEMetropolisZ_tune_drop_fraction.myst.md  |  10 +-
 .../samplers/MLDA_gravity_surveying.ipynb     |   9 --
 .../samplers/MLDA_gravity_surveying.myst.md   |   9 --
 .../SMC-ABC_Lotka-Volterra_example.ipynb      |   1 +
 .../SMC-ABC_Lotka-Volterra_example.myst.md    |   1 +
 .../survival_analysis/survival_analysis.ipynb |   2 -
 .../survival_analysis.myst.md                 |   2 -
 .../time_series/Euler-Maruyama_and_SDEs.ipynb |   1 -
 .../Euler-Maruyama_and_SDEs.myst.md           |   1 -
 .../MvGaussianRandomWalk_demo.myst.md         |   4 +-
 .../GLM-hierarchical-advi-minibatch.ipynb     |   4 -
 .../GLM-hierarchical-advi-minibatch.myst.md   |   4 -
 .../variational_api_quickstart.ipynb          |   6 --
 .../variational_api_quickstart.myst.md        |   6 --
 sphinxext/thumbnail_extractor.py              |   1 -
 78 files changed, 127 insertions(+), 572 deletions(-)

diff --git a/examples/case_studies/GEV.myst.md b/examples/case_studies/GEV.myst.md
index eecc303c1..d889df37a 100644
--- a/examples/case_studies/GEV.myst.md
+++ b/examples/case_studies/GEV.myst.md
@@ -10,8 +10,6 @@ kernelspec:
   name: pymc4-dev
 ---
 
-+++ {"tags": []}
-
 # Generalized Extreme Value Distribution
 
 :::{post} Sept 27, 2022
@@ -20,7 +18,7 @@ kernelspec:
 :author: Colin Caprani 
 :::
 
-+++ {"tags": []}
++++
 
 ## Introduction
 
@@ -94,8 +92,6 @@ And now set up the model using priors estimated from a quick review of the histo
 - $\xi$: we are agnostic to the tail behaviour so centre this at zero, but limit to physically reasonable bounds of $\pm 0.6$, and keep it somewhat tight near zero.
 
 ```{code-cell} ipython3
-:tags: []
-
 # Optionally centre the data, depending on fitting and divergences
 # cdata = (data - data.mean())/data.std()
 
diff --git a/examples/case_studies/Missing_Data_Imputation.ipynb b/examples/case_studies/Missing_Data_Imputation.ipynb
index 29e64c500..93011664d 100644
--- a/examples/case_studies/Missing_Data_Imputation.ipynb
+++ b/examples/case_studies/Missing_Data_Imputation.ipynb
@@ -1934,7 +1934,6 @@
     "\n",
     "\n",
     "def make_model(priors, normal_pred_assumption=True):\n",
-    "\n",
     "    coords = {\n",
     "        \"alpha_dim\": [\"lmx_imputed\", \"climate_imputed\", \"empower_imputed\"],\n",
     "        \"beta_dim\": [\n",
@@ -8628,7 +8627,6 @@
     "\n",
     "\n",
     "with pm.Model(coords=coords) as hierarchical_model:\n",
-    "\n",
     "    # Priors\n",
     "    company_beta_lmx = pm.Normal(\"company_beta_lmx\", 0, 1)\n",
     "    company_beta_male = pm.Normal(\"company_beta_male\", 0, 1)\n",
diff --git a/examples/case_studies/Missing_Data_Imputation.myst.md b/examples/case_studies/Missing_Data_Imputation.myst.md
index 9a533fba5..082db662c 100644
--- a/examples/case_studies/Missing_Data_Imputation.myst.md
+++ b/examples/case_studies/Missing_Data_Imputation.myst.md
@@ -426,7 +426,6 @@ priors = {
 
 
 def make_model(priors, normal_pred_assumption=True):
-
     coords = {
         "alpha_dim": ["lmx_imputed", "climate_imputed", "empower_imputed"],
         "beta_dim": [
@@ -707,7 +706,6 @@ coords = {"team": teams, "employee": np.arange(len(df_employee))}
 
 
 with pm.Model(coords=coords) as hierarchical_model:
-
     # Priors
     company_beta_lmx = pm.Normal("company_beta_lmx", 0, 1)
     company_beta_male = pm.Normal("company_beta_male", 0, 1)
diff --git a/examples/case_studies/bart_heteroscedasticity.myst.md b/examples/case_studies/bart_heteroscedasticity.myst.md
index edcf1a09d..271e9122f 100644
--- a/examples/case_studies/bart_heteroscedasticity.myst.md
+++ b/examples/case_studies/bart_heteroscedasticity.myst.md
@@ -24,8 +24,6 @@ kernelspec:
 In this notebook we show how to use BART to model heteroscedasticity as described in Section 4.1 of [`pymc-bart`](https://github.com/pymc-devs/pymc-bart)'s paper {cite:p}`quiroga2022bart`. We use the `marketing` data set provided by the R package `datarium` {cite:p}`kassambara2019datarium`. The idea is to model a marketing channel contribution to sales as a function of budget.
 
 ```{code-cell} ipython3
-:tags: []
-
 import os
 
 import arviz as az
@@ -37,8 +35,6 @@ import pymc_bart as pmb
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 %config InlineBackend.figure_format = "retina"
 az.style.use("arviz-darkgrid")
 plt.rcParams["figure.figsize"] = [10, 6]
@@ -157,8 +153,6 @@ The fit looks good! In fact, we see that the mean and variance increase as a fun
 ## Watermark
 
 ```{code-cell} ipython3
-:tags: []
-
 %load_ext watermark
 %watermark -n -u -v -iv -w -p pytensor
 ```
diff --git a/examples/case_studies/binning.myst.md b/examples/case_studies/binning.myst.md
index 3b0220c74..002deefdd 100644
--- a/examples/case_studies/binning.myst.md
+++ b/examples/case_studies/binning.myst.md
@@ -106,8 +106,6 @@ Hypothetically we could have used base python, or numpy, to describe the generat
 The approach was illustrated with a Gaussian distribution, and below we show a number of worked examples using Gaussian distributions. However, the approach is general, and at the end of the notebook we provide a demonstration that the approach does indeed extend to non-Gaussian distributions.
 
 ```{code-cell} ipython3
-:tags: []
-
 import warnings
 
 import arviz as az
@@ -219,8 +217,6 @@ We will start by investigating what happens when we use only one set of bins to
 ### Model specification
 
 ```{code-cell} ipython3
-:tags: []
-
 with pm.Model() as model1:
     sigma = pm.HalfNormal("sigma")
     mu = pm.Normal("mu")
@@ -235,8 +231,6 @@ pm.model_to_graphviz(model1)
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 with model1:
     trace1 = pm.sample()
 ```
@@ -248,8 +242,6 @@ Given the posterior values,
 we should be able to generate observations that look close to what we observed.
 
 ```{code-cell} ipython3
-:tags: []
-
 with model1:
     ppc = pm.sample_posterior_predictive(trace1)
 ```
@@ -294,22 +286,16 @@ The more important question is whether we have recovered the parameters of the d
 Recall that we used `mu = -2` and `sigma = 2` to generate the data.
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_posterior(trace1, var_names=["mu", "sigma"], ref_val=[true_mu, true_sigma]);
 ```
 
 Pretty good! And we can access the posterior mean estimates (stored as [xarray](http://xarray.pydata.org/en/stable/index.html) types) as below. The MCMC samples arrive back in a 2D matrix with one dimension for the MCMC chain (`chain`), and one for the sample number (`draw`). We can calculate the overall posterior average with `.mean(dim=["draw", "chain"])`.
 
 ```{code-cell} ipython3
-:tags: []
-
 trace1.posterior["mu"].mean(dim=["draw", "chain"]).values
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 trace1.posterior["sigma"].mean(dim=["draw", "chain"]).values
 ```
 
@@ -324,8 +310,6 @@ Above, we used one set of binned data. Let's see what happens when we swap out f
 As with the above, here's the model specification.
 
 ```{code-cell} ipython3
-:tags: []
-
 with pm.Model() as model2:
     sigma = pm.HalfNormal("sigma")
     mu = pm.Normal("mu")
@@ -336,15 +320,11 @@ with pm.Model() as model2:
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 with model2:
     trace2 = pm.sample()
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_trace(trace2);
 ```
 
@@ -353,8 +333,6 @@ az.plot_trace(trace2);
 Let's run a PPC check to ensure we are generating data that are similar to what we observed.
 
 ```{code-cell} ipython3
-:tags: []
-
 with model2:
     ppc = pm.sample_posterior_predictive(trace2)
 ```
@@ -362,14 +340,10 @@ with model2:
 We calculate the mean bin posterior predictive bin counts, averaged over samples.
 
 ```{code-cell} ipython3
-:tags: []
-
 ppc.posterior_predictive.counts2.mean(dim=["chain", "draw"]).values
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 c2.values
 ```
 
@@ -399,14 +373,10 @@ az.plot_posterior(trace2, var_names=["mu", "sigma"], ref_val=[true_mu, true_sigm
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 trace2.posterior["mu"].mean(dim=["draw", "chain"]).values
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 trace2.posterior["sigma"].mean(dim=["draw", "chain"]).values
 ```
 
@@ -419,8 +389,6 @@ Now we need to see what happens if we add in both ways of binning.
 ### Model Specification
 
 ```{code-cell} ipython3
-:tags: []
-
 with pm.Model() as model3:
     sigma = pm.HalfNormal("sigma")
     mu = pm.Normal("mu")
@@ -442,8 +410,6 @@ pm.model_to_graphviz(model3)
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 with model3:
     trace3 = pm.sample()
 ```
@@ -488,20 +454,14 @@ ax[1].set_title("Seven bin discretization of N(-2, 2)")
 ### Recovering parameters
 
 ```{code-cell} ipython3
-:tags: []
-
 trace3.posterior["mu"].mean(dim=["draw", "chain"]).values
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 trace3.posterior["sigma"].mean(dim=["draw", "chain"]).values
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_posterior(trace3, var_names=["mu", "sigma"], ref_val=[true_mu, true_sigma]);
 ```
 
diff --git a/examples/case_studies/blackbox_external_likelihood.ipynb b/examples/case_studies/blackbox_external_likelihood.ipynb
index 1b1f038dd..f9ca30802 100644
--- a/examples/case_studies/blackbox_external_likelihood.ipynb
+++ b/examples/case_studies/blackbox_external_likelihood.ipynb
@@ -499,7 +499,6 @@
    "source": [
     "# define a theano Op for our likelihood function\n",
     "class LogLikeWithGrad(tt.Op):\n",
-    "\n",
     "    itypes = [tt.dvector]  # expects a vector of parameter values when called\n",
     "    otypes = [tt.dscalar]  # outputs a single scalar value (the log likelihood)\n",
     "\n",
diff --git a/examples/case_studies/blackbox_external_likelihood.myst.md b/examples/case_studies/blackbox_external_likelihood.myst.md
index 9d21c84a6..12a145201 100644
--- a/examples/case_studies/blackbox_external_likelihood.myst.md
+++ b/examples/case_studies/blackbox_external_likelihood.myst.md
@@ -423,7 +423,6 @@ It's not quite so simple! The `grad()` method itself requires that its inputs ar
 ```{code-cell} ipython3
 # define a theano Op for our likelihood function
 class LogLikeWithGrad(tt.Op):
-
     itypes = [tt.dvector]  # expects a vector of parameter values when called
     otypes = [tt.dscalar]  # outputs a single scalar value (the log likelihood)
 
diff --git a/examples/case_studies/blackbox_external_likelihood_numpy.ipynb b/examples/case_studies/blackbox_external_likelihood_numpy.ipynb
index 265a9a20f..8ca0cf85d 100644
--- a/examples/case_studies/blackbox_external_likelihood_numpy.ipynb
+++ b/examples/case_studies/blackbox_external_likelihood_numpy.ipynb
@@ -438,7 +438,6 @@
    "source": [
     "# define a pytensor Op for our likelihood function\n",
     "class LogLikeWithGrad(pt.Op):\n",
-    "\n",
     "    itypes = [pt.dvector]  # expects a vector of parameter values when called\n",
     "    otypes = [pt.dscalar]  # outputs a single scalar value (the log likelihood)\n",
     "\n",
diff --git a/examples/case_studies/blackbox_external_likelihood_numpy.myst.md b/examples/case_studies/blackbox_external_likelihood_numpy.myst.md
index 5cf327634..eac5ba1e9 100644
--- a/examples/case_studies/blackbox_external_likelihood_numpy.myst.md
+++ b/examples/case_studies/blackbox_external_likelihood_numpy.myst.md
@@ -246,7 +246,6 @@ It's not quite so simple! The `grad()` method itself requires that its inputs ar
 ```{code-cell} ipython3
 # define a pytensor Op for our likelihood function
 class LogLikeWithGrad(pt.Op):
-
     itypes = [pt.dvector]  # expects a vector of parameter values when called
     otypes = [pt.dscalar]  # outputs a single scalar value (the log likelihood)
 
diff --git a/examples/case_studies/conditional_autoregressive_priors.myst.md b/examples/case_studies/conditional_autoregressive_priors.myst.md
index 502b2f459..791912a09 100644
--- a/examples/case_studies/conditional_autoregressive_priors.myst.md
+++ b/examples/case_studies/conditional_autoregressive_priors.myst.md
@@ -13,8 +13,6 @@ myst:
     extra_dependencies: geopandas libpysal
 ---
 
-+++ {"tags": []}
-
 (conditional_autoregressive_priors)=
 # Conditional Autoregressive (CAR) Models for Spatial Data
 
@@ -82,8 +80,6 @@ except FileNotFoundError:
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 df_scot_cancer.head()
 ```
 
diff --git a/examples/case_studies/disaster_model.py b/examples/case_studies/disaster_model.py
index 1a77a8812..432c5efc3 100644
--- a/examples/case_studies/disaster_model.py
+++ b/examples/case_studies/disaster_model.py
@@ -137,7 +137,6 @@
 year = arange(1851, 1962)
 
 with pm.Model() as model:
-
     switchpoint = pm.DiscreteUniform("switchpoint", lower=year.min(), upper=year.max())
     early_mean = pm.Exponential("early_mean", lam=1.0)
     late_mean = pm.Exponential("late_mean", lam=1.0)
diff --git a/examples/case_studies/disaster_model_theano_op.py b/examples/case_studies/disaster_model_theano_op.py
index fd830854e..6f58dbb2d 100644
--- a/examples/case_studies/disaster_model_theano_op.py
+++ b/examples/case_studies/disaster_model_theano_op.py
@@ -142,7 +142,6 @@ def rate_(switchpoint, early_mean, late_mean):
 
 
 with pm.Model() as model:
-
     # Prior for distribution of switchpoint location
     switchpoint = pm.DiscreteUniform("switchpoint", lower=0, upper=years)
     # Priors for pre- and post-switch mean number of disasters
diff --git a/examples/case_studies/gelman_bioassay.py b/examples/case_studies/gelman_bioassay.py
index 032080755..3bead2e26 100644
--- a/examples/case_studies/gelman_bioassay.py
+++ b/examples/case_studies/gelman_bioassay.py
@@ -8,7 +8,6 @@
 dose = array([-0.86, -0.3, -0.05, 0.73])
 
 with pm.Model() as model:
-
     # Logit-linear model parameters
     alpha = pm.Normal("alpha", 0, sigma=100.0)
     beta = pm.Normal("beta", 0, sigma=1.0)
diff --git a/examples/case_studies/gelman_schools.py b/examples/case_studies/gelman_schools.py
index 36e72a91c..4066e9320 100644
--- a/examples/case_studies/gelman_schools.py
+++ b/examples/case_studies/gelman_schools.py
@@ -31,7 +31,6 @@
 sigma = np.array([15, 10, 16, 11, 9, 11, 10, 18])
 
 with Model() as schools:
-
     eta = Normal("eta", 0, 1, shape=J)
     mu = Normal("mu", 0, sigma=1e6)
     tau = HalfCauchy("tau", 25)
diff --git a/examples/case_studies/hierarchical_partial_pooling.ipynb b/examples/case_studies/hierarchical_partial_pooling.ipynb
index f57e05757..1604f53fd 100644
--- a/examples/case_studies/hierarchical_partial_pooling.ipynb
+++ b/examples/case_studies/hierarchical_partial_pooling.ipynb
@@ -155,7 +155,6 @@
     "coords = {\"player_names\": player_names.tolist()}\n",
     "\n",
     "with pm.Model(coords=coords) as baseball_model:\n",
-    "\n",
     "    phi = pm.Uniform(\"phi\", lower=0.0, upper=1.0)\n",
     "\n",
     "    kappa_log = pm.Exponential(\"kappa_log\", lam=1.5)\n",
@@ -186,7 +185,6 @@
    "outputs": [],
    "source": [
     "with baseball_model:\n",
-    "\n",
     "    theta_new = pm.Beta(\"theta_new\", alpha=phi * kappa, beta=(1.0 - phi) * kappa)\n",
     "    y_new = pm.Binomial(\"y_new\", n=4, p=theta_new, observed=0)"
    ]
diff --git a/examples/case_studies/hierarchical_partial_pooling.myst.md b/examples/case_studies/hierarchical_partial_pooling.myst.md
index 651952562..11c0c0421 100644
--- a/examples/case_studies/hierarchical_partial_pooling.myst.md
+++ b/examples/case_studies/hierarchical_partial_pooling.myst.md
@@ -96,7 +96,6 @@ player_names = data["FirstName"] + " " + data["LastName"]
 coords = {"player_names": player_names.tolist()}
 
 with pm.Model(coords=coords) as baseball_model:
-
     phi = pm.Uniform("phi", lower=0.0, upper=1.0)
 
     kappa_log = pm.Exponential("kappa_log", lam=1.5)
@@ -110,7 +109,6 @@ Recall our original question was with regard to the true batting average for a p
 
 ```{code-cell} ipython3
 with baseball_model:
-
     theta_new = pm.Beta("theta_new", alpha=phi * kappa, beta=(1.0 - phi) * kappa)
     y_new = pm.Binomial("y_new", n=4, p=theta_new, observed=0)
 ```
diff --git a/examples/case_studies/item_response_nba.ipynb b/examples/case_studies/item_response_nba.ipynb
index ed2c9ace9..0056fca3e 100644
--- a/examples/case_studies/item_response_nba.ipynb
+++ b/examples/case_studies/item_response_nba.ipynb
@@ -358,6 +358,7 @@
     "df_position.index.name = \"player\"\n",
     "df_position.columns = [\"position\"]\n",
     "\n",
+    "\n",
     "# 2. Create the binary foul_called variable\n",
     "def foul_called(decision):\n",
     "    \"\"\"Correct and incorrect noncalls (CNC and INC) take value 0.\n",
@@ -449,7 +450,6 @@
     "coords = {\"disadvantaged\": disadvantaged, \"committing\": committing}\n",
     "\n",
     "with pm.Model(coords=coords) as model:\n",
-    "\n",
     "    # Data\n",
     "    foul_called_observed = pm.Data(\"foul_called_observed\", df.foul_called, mutable=False)\n",
     "\n",
@@ -810,6 +810,7 @@
     "    / (1 + np.exp(-(mu_theta_mean - trace.posterior[\"b\"].mean(dim=[\"chain\", \"draw\"])))).to_pandas()\n",
     ")\n",
     "\n",
+    "\n",
     "# Compute difference of raw and posterior mean for each\n",
     "# disadvantaged and committing player\n",
     "def diff(a, b):\n",
diff --git a/examples/case_studies/item_response_nba.myst.md b/examples/case_studies/item_response_nba.myst.md
index 54989519f..c0713e90f 100644
--- a/examples/case_studies/item_response_nba.myst.md
+++ b/examples/case_studies/item_response_nba.myst.md
@@ -88,8 +88,6 @@ We first import data from the original data set, which can be found at [this URL
 We note that we  already removed  from the original  dataset the plays where less than two players are involved (for example travel calls or clock violations). Also, the original dataset does not contain information on the players' position, which we added ourselves.
 
 ```{code-cell} ipython3
-:tags: []
-
 try:
     df_orig = pd.read_csv(os.path.join("..", "data", "item_response_nba.csv"), index_col=0)
 except FileNotFoundError:
@@ -105,8 +103,6 @@ We now process our data in three steps:
 Finally, we display the head of our main dataframe `df` along with some basic statistics.
 
 ```{code-cell} ipython3
-:tags: []
-
 # 1. Construct df and df_position
 df = df_orig[["committing", "disadvantaged", "decision"]]
 
@@ -120,6 +116,7 @@ df_position = df_position[~df_position.index.duplicated(keep="first")]
 df_position.index.name = "player"
 df_position.columns = ["position"]
 
+
 # 2. Create the binary foul_called variable
 def foul_called(decision):
     """Correct and incorrect noncalls (CNC and INC) take value 0.
@@ -146,8 +143,6 @@ print(f"Global probability of a foul being called: " f"{100*round(df.foul_called
 df.head()
 ```
 
-+++ {"tags": []}
-
 ## Item Response Model
 
 ### Model definition
@@ -201,7 +196,6 @@ We now implement the model above in PyMC. Note that, to easily keep track of the
 coords = {"disadvantaged": disadvantaged, "committing": committing}
 
 with pm.Model(coords=coords) as model:
-
     # Data
     foul_called_observed = pm.Data("foul_called_observed", df.foul_called, mutable=False)
 
@@ -226,8 +220,6 @@ with pm.Model(coords=coords) as model:
 We now plot our model to show the hierarchical structure (and the non-centered parametrisation) on the variables `theta` and `b`.
 
 ```{code-cell} ipython3
-:tags: []
-
 pm.model_to_graphviz(model)
 ```
 
@@ -255,8 +247,6 @@ Our first check is to plot
 These plots show, as expected, that the hierarchical structure of our model tends to estimate posteriors towards the global mean for players with a low amount of observations.
 
 ```{code-cell} ipython3
-:tags: []
-
 # Global posterior means of μ_theta and μ_b
 mu_theta_mean, mu_b_mean = trace.posterior["mu_theta"].mean(), 0
 # Raw mean from data of each disadvantaged player
@@ -273,6 +263,7 @@ committing_posterior_mean = (
     / (1 + np.exp(-(mu_theta_mean - trace.posterior["b"].mean(dim=["chain", "draw"])))).to_pandas()
 )
 
+
 # Compute difference of raw and posterior mean for each
 # disadvantaged and committing player
 def diff(a, b):
@@ -403,7 +394,7 @@ plt.show();
 
 By visiting [Austin Rochford post](https://www.austinrochford.com/posts/2017-04-04-nba-irt.html) and checking the analogous table for the Rasch model there (which uses data from the 2016-17 season),  the reader can see that several top players in both skills are still in the top 10 with our larger data set (covering seasons 2015-16 to 2020-21).
 
-+++ {"tags": []}
++++
 
 ### Discovering extra hierarchical structure
 
@@ -467,8 +458,6 @@ These plots suggest that scoring high in `theta` does not correlate with high or
 Given the last observation, we decide to plot a histogram for the occurrence of different positions for top disadvantaged (`theta`) and committing (`b`) players. Interestingly, we see below that the largest share of best disadvantaged players are guards, meanwhile, the largest share of best committing players are centers (and at the same time a very small share of guards).
 
 ```{code-cell} ipython3
-:tags: []
-
 amount = 50  # How many top players we want to display
 top_theta_players = top_theta["disadvantaged"][:amount].values
 top_b_players = top_b["committing"][:amount].values
@@ -497,7 +486,7 @@ The histograms above suggest that it might be appropriate to add a hierarchical
 
 A warm thank you goes to [Eric Ma](https://github.com/ericmjl) for many useful comments that improved this notebook.
 
-+++ {"tags": []}
++++
 
 ## Authors
  
@@ -518,8 +507,6 @@ A warm thank you goes to [Eric Ma](https://github.com/ericmjl) for many useful c
 ## Watermark
 
 ```{code-cell} ipython3
-:tags: []
-
 %load_ext watermark
 %watermark -n -u -v -iv -w -p pytensor,aeppl,xarray
 ```
diff --git a/examples/case_studies/longitudinal_models.myst.md b/examples/case_studies/longitudinal_models.myst.md
index 0184086e3..c082edee4 100644
--- a/examples/case_studies/longitudinal_models.myst.md
+++ b/examples/case_studies/longitudinal_models.myst.md
@@ -20,8 +20,6 @@ kernelspec:
 :::
 
 ```{code-cell} ipython3
-:tags: []
-
 import arviz as az
 import bambi as bmb
 import matplotlib.pyplot as plt
@@ -37,8 +35,6 @@ lowess = sm.nonparametric.lowess
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 %config InlineBackend.figure_format = 'retina'  # high resolution figures
 az.style.use("arviz-darkgrid")
 rng = np.random.default_rng(42)
@@ -62,8 +58,6 @@ We'll follow the discussion and iterative approach to model building outlined in
 For any longitudinal analysis we need three components: (1) multiple waves of data collection (2) a suitable definition of time and (3) an outcome of interest. Combining these we can assess how the individual changes over time with respect that outcome. In this first series of models we will look at how adolescent alcohol usage varies between children from the age of 14 onwards with data collected annually over three years.
 
 ```{code-cell} ipython3
-:tags: []
-
 try:
     df = pd.read_csv("../data/alcohol1_pp.csv")
 except FileNotFoundError:
@@ -198,8 +192,6 @@ We begin with a simple unconditional model where we model only the individual's
 
 
 ```{code-cell} ipython3
-:tags: []
-
 id_indx, unique_ids = pd.factorize(df["id"])
 coords = {"ids": unique_ids, "obs": range(len(df["alcuse"]))}
 with pm.Model(coords=coords) as model:
@@ -261,8 +253,6 @@ $$ \begin{aligned}
 Fitting the model then informs us about how each individual modifies the global model, but also lets us learn global parameters. In particular we allow for a subject specific modification of the coefficient on the variable representing time. A broadly similar pattern of combination holds for all the hierarchical models we outline in the following series of models. In the Bayesian setting we're trying to learn the parameters that best fit the data. Implementing the model in PyMC is as follows:
 
 ```{code-cell} ipython3
-:tags: []
-
 id_indx, unique_ids = pd.factorize(df["id"])
 coords = {"ids": unique_ids, "obs": range(len(df["alcuse"]))}
 with pm.Model(coords=coords) as model:
@@ -349,8 +339,6 @@ $$ \begin{aligned}
 \end{aligned} $$
 
 ```{code-cell} ipython3
-:tags: []
-
 id_indx, unique_ids = pd.factorize(df["id"])
 coords = {"ids": unique_ids, "obs": range(len(df["alcuse"]))}
 with pm.Model(coords=coords) as model:
@@ -477,8 +465,6 @@ $$ \begin{aligned}
 \end{aligned} $$
 
 ```{code-cell} ipython3
-:tags: []
-
 id_indx, unique_ids = pd.factorize(df["id"])
 coords = {"ids": unique_ids, "obs": range(len(df["alcuse"]))}
 with pm.Model(coords=coords) as model:
@@ -660,8 +646,6 @@ While we're fitting these models directly within PyMC there is an alternative ba
 The formula specification uses `1` to denote an intercept term and a conditional `|` operator to denote a subject level parameter combined with the global parameter of the same type in the manner specified above. We will add subject specific modifications of the intercept term and beta coefficient on the focal variable of age as in the models above. We do so using the syntax `(1 + age_14 | id)` in the formula syntax for Bambi.
 
 ```{code-cell} ipython3
-:tags: []
-
 formula = "alcuse ~ 1 + age_14 + coa + cpeer + age_14:coa + age_14:cpeer + (1 + age_14 | id)"
 model = bmb.Model(formula, df)
 
@@ -674,8 +658,6 @@ idata_bambi
 The model is nicely specified and details the structure of hierarchical and subject level parameters. By default the Bambi model assigns priors and uses a non-centred parameterisation. The Bambi model definition uses the language of common and group level effects as opposed to the global and subject distinction we have beeen using in this example so far. Again, the important point to stress is just the hierarchy of levels, not the names.
 
 ```{code-cell} ipython3
-:tags: []
-
 model
 ```
 
@@ -686,8 +668,6 @@ model.graph()
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.summary(
     idata_bambi,
     var_names=[
@@ -734,8 +714,6 @@ We can see here how the bambi model specification recovers the same parameterisa
 Next we'll look at a dataset where the individual trajectories show wild swings in behaviour across the individuals. The data reports a score per child of externalizing behaviors. This can measure a variety of behaviours including but not limited to: physical aggression, verbal bullying, relational aggression, defiance, theft, and vandalism. The higher on the scale the more external behaviours demonstrated by the child. The scale is bounded at 0 and has a maximum possible score of 68. Each individual child is measured for these behaviours in each grade of school. 
 
 ```{code-cell} ipython3
-:tags: []
-
 try:
     df_external = pd.read_csv("../data/external_pp.csv")
 except FileNotFoundError:
@@ -781,8 +759,6 @@ plt.hist(np.random.gumbel(guess["mu"], guess["beta"], 1000), bins=30, ec="black"
 As before we'll begin with a fairly minimal model, specifying a hierarchical model where each individual modifies the grand mean. We allow for a non-normal censored likelihood term. 
 
 ```{code-cell} ipython3
-:tags: []
-
 id_indx, unique_ids = pd.factorize(df_external["ID"])
 coords = {"ids": unique_ids, "obs": range(len(df_external["EXTERNAL"]))}
 with pm.Model(coords=coords) as model:
@@ -834,8 +810,6 @@ ax.set_title("Distribution of Individual Modifications to the Grand Mean");
 We now model the evolution of the behaviours over time in a hierarchical fashion. We start with a simple hierarhical linear regression with a focal predictor of grade. 
 
 ```{code-cell} ipython3
-:tags: []
-
 id_indx, unique_ids = pd.factorize(df_external["ID"])
 coords = {"ids": unique_ids, "obs": range(len(df_external["EXTERNAL"]))}
 with pm.Model(coords=coords) as model:
@@ -920,8 +894,6 @@ We can see here how the model is constrained to apply a very linear fit to the b
 To give the model more flexibility to model change over time we can add in polynomial terms. 
 
 ```{code-cell} ipython3
-:tags: []
-
 id_indx, unique_ids = pd.factorize(df_external["ID"])
 coords = {"ids": unique_ids, "obs": range(len(df_external["EXTERNAL"]))}
 with pm.Model(coords=coords) as model:
@@ -1079,8 +1051,6 @@ with pm.Model(coords=coords) as model:
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 pm.model_to_graphviz(model)
 ```
 
diff --git a/examples/case_studies/multilevel_modeling.ipynb b/examples/case_studies/multilevel_modeling.ipynb
index c752f4ceb..9316c1920 100644
--- a/examples/case_studies/multilevel_modeling.ipynb
+++ b/examples/case_studies/multilevel_modeling.ipynb
@@ -1320,7 +1320,6 @@
     "    (unpooled_trace, partial_pooling_trace),\n",
     "    (\"no pooling\", \"partial pooling\"),\n",
     "):\n",
-    "\n",
     "    # add variable with x values to xarray dataset\n",
     "    trace.posterior = trace.posterior.assign_coords({\"N_county\": (\"county\", N_county)})\n",
     "    # plot means\n",
@@ -2829,7 +2828,6 @@
     "coords[\"param\"] = [\"alpha\", \"beta\"]\n",
     "coords[\"param_bis\"] = [\"alpha\", \"beta\"]\n",
     "with pm.Model(coords=coords) as covariation_intercept_slope:\n",
-    "\n",
     "    floor_idx = pm.MutableData(\"floor_idx\", floor_measure, dims=\"obs_id\")\n",
     "    county_idx = pm.MutableData(\"county_idx\", county, dims=\"obs_id\")\n",
     "\n",
@@ -2936,7 +2934,6 @@
     "coords[\"param\"] = [\"alpha\", \"beta\"]\n",
     "coords[\"param_bis\"] = [\"alpha\", \"beta\"]\n",
     "with pm.Model(coords=coords) as covariation_intercept_slope:\n",
-    "\n",
     "    floor_idx = pm.MutableData(\"floor_idx\", floor_measure, dims=\"obs_id\")\n",
     "    county_idx = pm.MutableData(\"county_idx\", county, dims=\"obs_id\")\n",
     "\n",
@@ -3161,7 +3158,6 @@
    "outputs": [],
    "source": [
     "with pm.Model(coords=coords) as hierarchical_intercept:\n",
-    "\n",
     "    # Priors\n",
     "    sigma_a = pm.HalfCauchy(\"sigma_a\", 5)\n",
     "\n",
diff --git a/examples/case_studies/multilevel_modeling.myst.md b/examples/case_studies/multilevel_modeling.myst.md
index d3bc79cd0..166bc43f5 100644
--- a/examples/case_studies/multilevel_modeling.myst.md
+++ b/examples/case_studies/multilevel_modeling.myst.md
@@ -418,7 +418,6 @@ for ax, trace, level in zip(
     (unpooled_trace, partial_pooling_trace),
     ("no pooling", "partial pooling"),
 ):
-
     # add variable with x values to xarray dataset
     trace.posterior = trace.posterior.assign_coords({"N_county": ("county", N_county)})
     # plot means
@@ -787,7 +786,6 @@ This translates quite easily in PyMC:
 coords["param"] = ["alpha", "beta"]
 coords["param_bis"] = ["alpha", "beta"]
 with pm.Model(coords=coords) as covariation_intercept_slope:
-
     floor_idx = pm.MutableData("floor_idx", floor_measure, dims="obs_id")
     county_idx = pm.MutableData("county_idx", county, dims="obs_id")
 
@@ -822,7 +820,6 @@ As you may expect, we also want to non-center the random effects here. This agai
 coords["param"] = ["alpha", "beta"]
 coords["param_bis"] = ["alpha", "beta"]
 with pm.Model(coords=coords) as covariation_intercept_slope:
-
     floor_idx = pm.MutableData("floor_idx", floor_measure, dims="obs_id")
     county_idx = pm.MutableData("county_idx", county, dims="obs_id")
 
@@ -960,7 +957,6 @@ This is fairly straightforward to implement in PyMC -- we just add another level
 
 ```{code-cell} ipython3
 with pm.Model(coords=coords) as hierarchical_intercept:
-
     # Priors
     sigma_a = pm.HalfCauchy("sigma_a", 5)
 
diff --git a/examples/case_studies/reliability_and_calibrated_prediction.ipynb b/examples/case_studies/reliability_and_calibrated_prediction.ipynb
index 2af46a85f..eb30ed396 100644
--- a/examples/case_studies/reliability_and_calibrated_prediction.ipynb
+++ b/examples/case_studies/reliability_and_calibrated_prediction.ipynb
@@ -2211,7 +2211,6 @@
    ],
    "source": [
     "def ecdf(sample):\n",
-    "\n",
     "    # convert sample to a numpy array, if it isn't already\n",
     "    sample = np.atleast_1d(sample)\n",
     "\n",
@@ -3859,7 +3858,6 @@
     "\n",
     "def make_model(p, info=False):\n",
     "    with pm.Model() as model:\n",
-    "\n",
     "        if info:\n",
     "            beta = pm.Normal(\"beta\", p[\"beta\"][0], p[\"beta\"][1])\n",
     "        else:\n",
diff --git a/examples/case_studies/reliability_and_calibrated_prediction.myst.md b/examples/case_studies/reliability_and_calibrated_prediction.myst.md
index 0d893d9db..2b28cdb83 100644
--- a/examples/case_studies/reliability_and_calibrated_prediction.myst.md
+++ b/examples/case_studies/reliability_and_calibrated_prediction.myst.md
@@ -459,7 +459,6 @@ We can use these bootstrapped statistics to further calculate quantities of the
 
 ```{code-cell} ipython3
 def ecdf(sample):
-
     # convert sample to a numpy array, if it isn't already
     sample = np.atleast_1d(sample)
 
@@ -897,7 +896,6 @@ priors_informative = {"beta": [10_000, 500], "alpha": [2, 0.5, 0.02, 3]}
 
 def make_model(p, info=False):
     with pm.Model() as model:
-
         if info:
             beta = pm.Normal("beta", p["beta"][0], p["beta"][1])
         else:
diff --git a/examples/case_studies/spline.myst.md b/examples/case_studies/spline.myst.md
index 0bd352db2..488f3286e 100644
--- a/examples/case_studies/spline.myst.md
+++ b/examples/case_studies/spline.myst.md
@@ -19,7 +19,7 @@ kernelspec:
 :author: Joshua Cook
 :::
 
-+++ {"tags": []}
++++
 
 ## Introduction
 
@@ -88,8 +88,6 @@ blossom_data.plot.scatter(
 );
 ```
 
-+++ {"tags": []}
-
 ## The model
 
 We will fit the following model.
diff --git a/examples/causal_inference/excess_deaths.ipynb b/examples/causal_inference/excess_deaths.ipynb
index 8375c9870..4277d24c6 100644
--- a/examples/causal_inference/excess_deaths.ipynb
+++ b/examples/causal_inference/excess_deaths.ipynb
@@ -450,7 +450,6 @@
    "outputs": [],
    "source": [
     "with pm.Model(coords={\"month\": month_strings}) as model:\n",
-    "\n",
     "    # observed predictors and outcome\n",
     "    month = pm.MutableData(\"month\", pre[\"month\"].to_numpy(), dims=\"t\")\n",
     "    time = pm.MutableData(\"time\", pre[\"t\"].to_numpy(), dims=\"t\")\n",
diff --git a/examples/causal_inference/excess_deaths.myst.md b/examples/causal_inference/excess_deaths.myst.md
index ea560a0c0..58acdda9c 100644
--- a/examples/causal_inference/excess_deaths.myst.md
+++ b/examples/causal_inference/excess_deaths.myst.md
@@ -52,7 +52,7 @@ We could take many different approaches to the modelling. Because we are dealing
 
 But because the focus of this case study is on the counterfactual reasoning rather than the specifics of time-series modelling, I chose the simpler approach of linear regression for time-series model (see {cite:t}`martin2021bayesian` for more on this).
 
-+++ {"tags": []}
++++
 
 ## Causal inference disclaimer
 
@@ -250,7 +250,6 @@ We are going to estimate reported deaths over time with an intercept, a linear t
 
 ```{code-cell} ipython3
 with pm.Model(coords={"month": month_strings}) as model:
-
     # observed predictors and outcome
     month = pm.MutableData("month", pre["month"].to_numpy(), dims="t")
     time = pm.MutableData("time", pre["t"].to_numpy(), dims="t")
diff --git a/examples/causal_inference/interrupted_time_series.myst.md b/examples/causal_inference/interrupted_time_series.myst.md
index ff25d5553..591c27efb 100644
--- a/examples/causal_inference/interrupted_time_series.myst.md
+++ b/examples/causal_inference/interrupted_time_series.myst.md
@@ -117,8 +117,6 @@ figsize = (10, 5)
 The focus of this example is on making causal claims using the interrupted time series approach. Therefore we will work with some relatively simple synthetic data which only requires a very simple model.
 
 ```{code-cell} ipython3
-:tags: []
-
 treatment_time = "2017-01-01"
 β0 = 0
 β1 = 0.1
diff --git a/examples/causal_inference/interventional_distribution.myst.md b/examples/causal_inference/interventional_distribution.myst.md
index f7411f86c..0806ebc05 100644
--- a/examples/causal_inference/interventional_distribution.myst.md
+++ b/examples/causal_inference/interventional_distribution.myst.md
@@ -14,7 +14,7 @@ myst:
     extra_dependencies: daft pymc_experimental
 ---
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 (interventional_distribution)=
 # Interventional distributions and graph mutation with the do-operator
@@ -25,7 +25,7 @@ myst:
 :author: Benjamin T. Vincent
 :::
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 [PyMC](https://github.com/pymc-devs/pymc) is a pivotal component of the open source Bayesian statistics ecosystem. It helps solve real problems across a wide range of industries and academic research areas every day. And it has gained this level of utility by being accessible, powerful, and practically useful at solving _Bayesian statistical inference_ problems.
 
@@ -46,7 +46,6 @@ If that sounds cool, let's dive in...
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 import arviz as az
 import graphviz as gr
@@ -58,7 +57,7 @@ import seaborn as sns
 from packaging import version
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 :::{include} ../extra_installs.md
 :::
@@ -70,7 +69,6 @@ This notebook relies on experimental functionality currently in the [pymc-experi
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 # Import additional libraries that are not dependencies of PyMC
 import daft
@@ -89,7 +87,6 @@ from pymc_experimental.model_transform.conditioning import do
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 RANDOM_SEED = 123
 rng = np.random.default_rng(RANDOM_SEED)
@@ -97,7 +94,7 @@ az.style.use("arviz-darkgrid")
 %config InlineBackend.figure_format = 'retina'
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ## What can we do with Bayesian inference?
 
@@ -108,7 +105,6 @@ Whether we are building _descriptive_ models or those that try to model the unde
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 J = 8
 y = np.array([28, 8, -3, 7, -1, 1, 18, 12])
@@ -124,7 +120,7 @@ with pm.Model() as schools:
 pm.model_to_graphviz(schools)
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 Regardless of the particular model or models we are working with, we can do a whole range of _statistical_ procedures:
 
@@ -144,7 +140,7 @@ But now it's time to get smacked in our smug Bayesian face. As others have argue
 
 In our running example, this could correspond to when a central bank switches from making predictions about inflation to now _acting_ and _intervening_ in the system by, for example, changing interest rates. All of a sudden you might be faced with a situation where the economy does not respond to your intervention as you predicted.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 Let's consider a seemingly trivial example with 3 nodes to see how we can get fooled. The image below shows two different causal DAGs. On the left we are interested in how $X$ causally affects $Y$, both directly and indirectly through a mediating variable $M$. If we take a purely statistical approach (e.g. {ref}`mediation_analysis`) we might find that the data is very plausibly generated by this DAG. This might give us the confidence to conduct an intervention on $M$ with the aim of influencing our target outcome, $Y$. But when we do this intervention in the real world and change $M$, we actually find absolutely no change in $Y$. What is going on here?
 
@@ -177,13 +173,13 @@ g.edge(tail_name="u", head_name="y", color="lightgrey")
 g
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 Little did we know, but the _actual_ data generating process is captured by the DAG on the right. This shows that $X$ does causally influence both $M$ and $Y$, however $M$ does not in fact causally affect $Y$. Instead, there is an unobserved variable $U$ which causally influences both $M$ and $Y$. This unobserved confounder creates a backdoor path in which _statistical_ association may flow between the path $X \rightarrow M \rightarrow U \rightarrow Y$. All this causes a statistical association between $M$ and $Y$ which our purely statistical approach mislead us into thinking that $M$ did causally influence $Y$ when it did not. No wonder our intervention failed to have any effects.
 
 Our mistake was to interpret a statistical model causally.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ## Statistical versus interventional distributions
 So far this has been quite high-level, but let's try to pin this down a little. In our example, if we were to take a purely statistical approach we could ask "What happened when interest rates were 2%?" This is a statistical question because we are basically looking back in our dataset and filtering (or conditioning) upon time points where interest rates were at (or very close to) 2%. So let's flag up - **conditional distributions are purely statistical quantities**.
@@ -194,7 +190,7 @@ Interventional distributions are cool because they allow us to ask what-if (or c
 
 From hereon, the main point of this notebook will be to provide some understanding and intuition about the differences between conditional and interventional distributions, and how to estimate interventional distributions with PyMC. As we said above, we can use the $\operatorname{do}$ operator to estimate interventional distributions. So let's dive in and see how that works.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ## Interventions and the $\operatorname{do}$ operator
 
@@ -253,7 +249,7 @@ So in summary, this is pretty cool. We can use the $\operatorname{do}$ operator
 
 For those wanting further background information on the $\operatorname{do}$ operator, explained from a different angle, readers should check out the richly diagrammed and well-explained blog post [Causal Effects via the Do-operator](https://towardsdatascience.com/causal-effects-via-the-do-operator-5415aefc834a) {cite:p}`Talebi2022dooperator`, the popular science book by {cite:t}`pearl2018why`, or the textbook by {cite:t}`molak2023ciadip`.
 
-+++ {"editable": true, "raw_mimetype": "", "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "raw_mimetype": "", "slideshow": {"slide_type": ""}}
 
 ## Three different causal DAGs
 
@@ -298,7 +294,7 @@ g.edge(tail_name="z", head_name="y")
 g
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 We can also imagine implementing such causal DAGs in Python code to generate `N` random numbers. Each of these will give rise to specific joint distributions, $P(x, y)$, and in fact, because Ferenc Huszár was clever in his blog post, we'll see later that these will all give rise to the same joint distributions.
 
@@ -330,7 +326,7 @@ x = z
 These code snippets are important because they define identical joint distributions $P(x,y)$ but they have different DAG structures. Therefore they may respond differently when it comes to making an intervention with the $\operatorname{do}$ operator. It is worth referring back to these code snippets to make sure you understand how they relate to the DAG structures above and to think through how making interventions on variables will affect the values of each of the variables $x, y, z$ if at all.
 :::
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 However, we are going to implement these using Bayesian causal DAGs with PyMC. Let's see how we can do this, then generate samples from them using `pm.sample_prior_predictive`. As we go with each DAG, we'll extract the samples for plotting later, and also plot the graphviz representation of the PyMC models. You'll see that while these are a fraction more visually complex, they do actually match up with the causal DAGs we've specified above.
 
@@ -339,7 +335,6 @@ However, we are going to implement these using Bayesian causal DAGs with PyMC. L
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 # number of samples to generate
 N = 1_000_000
@@ -350,7 +345,6 @@ N = 1_000_000
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 with pm.Model() as model1:
     x = pm.Normal("x")
@@ -368,7 +362,6 @@ pm.model_to_graphviz(model1)
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 with pm.Model() as model2:
     y = pm.Normal("y", mu=1, sigma=2)
@@ -386,7 +379,6 @@ pm.model_to_graphviz(model2)
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 with pm.Model() as model3:
     z = pm.Normal("z")
@@ -400,7 +392,7 @@ ds3 = az.extract(idata3.prior, var_names=["x", "y"])
 pm.model_to_graphviz(model3)
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ### Joint distributions, $P(x,y)$
 
@@ -435,11 +427,11 @@ for i, ds in enumerate([ds1, ds2, ds3]):
     ax[i].axvline(x=2, ls="--", c="k")
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 At this point we have met 3 different data generating processes (and their corresponding DAGs). We've drawn many MCMC samples from the prior distribution and visualised this joint distribution $P(x,y)$ for each of the models. We are now in position to recap the conditional distributions (e.g. $P(y|x=2$, see the next section) and how they compare to the interventional distribution $P(y|\operatorname{do}=2)$ in the section following that.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ### Conditional distributions, $P(y|x=2)$
 
@@ -452,7 +444,6 @@ In the MCMC spirit of representing probability distributions by samples, let's n
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 # Extract samples from P(y|x≈2)
 conditional1 = ds1.query(sample="1.99 < x < 2.01")["y"]
@@ -462,7 +453,7 @@ conditional3 = ds3.query(sample="1.99 < x < 2.01")["y"]
 
 So now we've got our MCMC estimates of $P(y|x=2)$ for all of the DAGs. But you're going to have to wait just a moment before we plot them. Let's move on to calculate $P(y|\operatorname{do}(x=2))$ and then plot them in one go so we can compare.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ### Interventional distributions, $P(y|\operatorname{do}(x=2))$
 
@@ -473,7 +464,6 @@ In turn for each of the 3 DAGs, let's use the $\operatorname{do}$ operator, sett
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 model1_do = do(model1, {"x": 2})
 pm.model_to_graphviz(model1_do)
@@ -493,11 +483,11 @@ model3_do = do(model3, {"x": 2})
 pm.model_to_graphviz(model3_do)
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 So we can see that in DAG 1, the $x$ variable still has causal influence on $y$. However, in DAGs 2 and 3, $y$ is no longer causally influenced by $x$. So in DAGs 2 and 3, our intervention $\operatorname{do}(x=2)$ have no influence on $y$.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 Next we'll sample from each of these interventional distributions. Note that we are using the mutilated models, `model1_do`, `model2_do`, and `model3_do`. 
 
@@ -506,7 +496,6 @@ Next we'll sample from each of these interventional distributions. Note that we
 editable: true
 slideshow:
   slide_type: ''
-tags: []
 ---
 with model1_do:
     idata1_do = pm.sample_prior_predictive(samples=N, random_seed=rng)
@@ -518,7 +507,7 @@ with model3_do:
     idata3_do = pm.sample_prior_predictive(samples=N, random_seed=rng)
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 So let's compare the conditional and interventional distributions for all 3 DAGs.
 
@@ -551,7 +540,7 @@ az.plot_density(
 ax[1].set(xlabel="y", title="Interventional distributions\n$P(y|\\operatorname{do}(x=2))$");
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 We can see, as expected, that the conditional distributions are the same for all 3 DAGs. Note that these distributions are not posterior distributions of estimated parameters - we have not conducted any parameter estimation here.
 
@@ -572,7 +561,7 @@ g.node(name="x2", label="x")
 g
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 The mutated DAG 3 is shown below. We can see that for this DAG, $P(y|\operatorname{do}(x=2)) = P(y|z)$. 
 
@@ -591,11 +580,11 @@ g.edge(tail_name="z", head_name="y")
 g
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 $P(y|\operatorname{do}(x=2))$ for DAG 2 and DAG 3 will actually be the same in this contrived example because the details were arranged to arrive at the same marginal distribution $P(y)$ for all DAGs.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ## Summary
 
@@ -607,7 +596,7 @@ The exciting thing is that there are many more causal reasoning ideas and concep
 
 Readers looking to learn more are suggested to check out the cited blog posts as well as textbooks, {cite:t}`pearl2000causality`, {cite:t}`pearl2016causal`, {cite:t}`mcelreath2018statistical`, {cite:t}`molak2023ciadip`.
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 ## Authors
 - Authored by [Benjamin T. Vincent](https://github.com/drbenvincent) in July 2023
@@ -629,6 +618,6 @@ Readers looking to learn more are suggested to check out the cited blog posts as
 %watermark -n -u -v -iv -w -p pytensor,xarray
 ```
 
-+++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}
++++ {"editable": true, "slideshow": {"slide_type": ""}}
 
 :::{include} ../page_footer.md :::
diff --git a/examples/causal_inference/regression_discontinuity.myst.md b/examples/causal_inference/regression_discontinuity.myst.md
index 26736574f..018a4265a 100644
--- a/examples/causal_inference/regression_discontinuity.myst.md
+++ b/examples/causal_inference/regression_discontinuity.myst.md
@@ -96,8 +96,6 @@ def plot_data(df):
 plot_data(df);
 ```
 
-+++ {"tags": []}
-
 ## Sharp regression discontinuity model
 
 We can define our Bayesian regression discontinuity model as:
@@ -158,7 +156,7 @@ az.plot_posterior(
 
 The most important thing is the posterior over the `effect` parameter. We can use that to base a judgement about the strength of the effect (e.g. through the 95% credible interval) or the presence/absence of an effect (e.g. through a Bayes Factor or ROPE).
 
-+++ {"tags": []}
++++
 
 ## Counterfactual questions
 
@@ -169,8 +167,6 @@ We can use posterior prediction to ask what would we expect to see if:
 _Technical note:_ Formally we are doing posterior prediction of `y`. Running `pm.sample_posterior_predictive` multiple times with different random seeds will result in new and different samples of `y` each time. However, this is not the case (we are not formally doing posterior prediction) for `mu`. This is because `mu` is a deterministic function (`mu = x + delta*treated`), so for our already obtained posterior samples of `delta`, the values of `mu` will be entirely determined by the values of `x` and `treated` data).
 
 ```{code-cell} ipython3
-:tags: []
-
 # MODEL EXPECTATION WITHOUT TREATMENT ------------------------------------
 # probe data
 _x = np.linspace(np.min(df.x), np.max(df.x), 500)
diff --git a/examples/diagnostics_and_criticism/model_averaging.myst.md b/examples/diagnostics_and_criticism/model_averaging.myst.md
index 413564b24..61a37dda4 100644
--- a/examples/diagnostics_and_criticism/model_averaging.myst.md
+++ b/examples/diagnostics_and_criticism/model_averaging.myst.md
@@ -27,7 +27,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:13:02.878264'
   status: completed
-tags: []
 ---
 import arviz as az
 import matplotlib.pyplot as plt
@@ -46,14 +45,13 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:13:07.836201'
   status: completed
-tags: []
 ---
 RANDOM_SEED = 8927
 np.random.seed(RANDOM_SEED)
 az.style.use("arviz-darkgrid")
 ```
 
-+++ {"papermill": {"duration": 0.068882, "end_time": "2020-11-29T12:13:08.020372", "exception": false, "start_time": "2020-11-29T12:13:07.951490", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.068882, "end_time": "2020-11-29T12:13:08.020372", "exception": false, "start_time": "2020-11-29T12:13:07.951490", "status": "completed"}}
 
 When confronted with more than one model we have several options. One of them is to perform model selection, using for example a given Information Criterion as exemplified the PyMC examples {ref}`pymc:model_comparison` and the {ref}`GLM-model-selection`. Model selection is appealing for its simplicity, but we are discarding information about the uncertainty in our models. This is somehow similar to computing the full posterior and then just keep a point-estimate like the posterior mean; we may become overconfident of what we really know. You can also browse the {doc}`blog/tag/model-comparison` tag to find related posts. 
 
@@ -108,7 +106,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:13:08.081202'
   status: completed
-tags: []
 ---
 d = pd.read_csv(
     "https://raw.githubusercontent.com/pymc-devs/resources/master/Rethinking_2/Data/milk.csv",
@@ -121,7 +118,7 @@ d.iloc[:, 1:] = d.iloc[:, 1:] - d.iloc[:, 1:].mean()
 d.head()
 ```
 
-+++ {"papermill": {"duration": 0.048113, "end_time": "2020-11-29T12:13:09.292526", "exception": false, "start_time": "2020-11-29T12:13:09.244413", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.048113, "end_time": "2020-11-29T12:13:09.292526", "exception": false, "start_time": "2020-11-29T12:13:09.244413", "status": "completed"}}
 
 Now that we have the data we are going to build our first model using only the `neocortex`.
 
@@ -133,7 +130,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:13:09.340679'
   status: completed
-tags: []
 ---
 with pm.Model() as model_0:
     alpha = pm.Normal("alpha", mu=0, sigma=10)
@@ -146,7 +142,7 @@ with pm.Model() as model_0:
     trace_0 = pm.sample(2000, return_inferencedata=True)
 ```
 
-+++ {"papermill": {"duration": 0.049578, "end_time": "2020-11-29T12:14:25.401979", "exception": false, "start_time": "2020-11-29T12:14:25.352401", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.049578, "end_time": "2020-11-29T12:14:25.401979", "exception": false, "start_time": "2020-11-29T12:14:25.352401", "status": "completed"}}
 
 The second model is exactly the same as the first one, except we now use the logarithm of the mass
 
@@ -158,7 +154,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:14:25.450888'
   status: completed
-tags: []
 ---
 with pm.Model() as model_1:
     alpha = pm.Normal("alpha", mu=0, sigma=10)
@@ -172,7 +167,7 @@ with pm.Model() as model_1:
     trace_1 = pm.sample(2000, return_inferencedata=True)
 ```
 
-+++ {"papermill": {"duration": 0.049839, "end_time": "2020-11-29T12:14:34.547268", "exception": false, "start_time": "2020-11-29T12:14:34.497429", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.049839, "end_time": "2020-11-29T12:14:34.547268", "exception": false, "start_time": "2020-11-29T12:14:34.497429", "status": "completed"}}
 
 And finally the third model using the `neocortex` and `log_mass` variables
 
@@ -184,7 +179,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:14:34.597234'
   status: completed
-tags: []
 ---
 with pm.Model() as model_2:
     alpha = pm.Normal("alpha", mu=0, sigma=10)
@@ -198,7 +192,7 @@ with pm.Model() as model_2:
     trace_2 = pm.sample(2000, return_inferencedata=True)
 ```
 
-+++ {"papermill": {"duration": 0.050236, "end_time": "2020-11-29T12:14:54.072799", "exception": false, "start_time": "2020-11-29T12:14:54.022563", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.050236, "end_time": "2020-11-29T12:14:54.072799", "exception": false, "start_time": "2020-11-29T12:14:54.022563", "status": "completed"}}
 
 Now that we have sampled the posterior for the 3 models, we are going to compare them visually. One option is to use the `forestplot` function that supports plotting more than one trace.
 
@@ -210,13 +204,12 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:14:54.123411'
   status: completed
-tags: []
 ---
 traces = [trace_0, trace_1, trace_2]
 az.plot_forest(traces, figsize=(10, 5));
 ```
 
-+++ {"papermill": {"duration": 0.052958, "end_time": "2020-11-29T12:14:55.196722", "exception": false, "start_time": "2020-11-29T12:14:55.143764", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.052958, "end_time": "2020-11-29T12:14:55.196722", "exception": false, "start_time": "2020-11-29T12:14:55.143764", "status": "completed"}}
 
 Another option is to plot several traces in a same plot is to use `plot_density`. This plot is somehow similar to a forestplot, but we get truncated KDE (kernel density estimation) plots (by default 95% credible intervals) grouped by variable names together with a point estimate (by default the mean).
 
@@ -228,7 +221,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:14:55.249276'
   status: completed
-tags: []
 ---
 ax = az.plot_density(
     traces,
@@ -246,7 +238,7 @@ ax[0, 1].set_ylabel("")
 ax[0, 1].set_title("95% Credible Intervals: sigma")
 ```
 
-+++ {"papermill": {"duration": 0.055089, "end_time": "2020-11-29T12:14:57.977616", "exception": false, "start_time": "2020-11-29T12:14:57.922527", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.055089, "end_time": "2020-11-29T12:14:57.977616", "exception": false, "start_time": "2020-11-29T12:14:57.922527", "status": "completed"}}
 
 Now that we have sampled the posterior for the 3 models, we are going to use WAIC (Widely applicable information criterion) to compare the 3 models. We can do this using the `compare` function included with ArviZ.
 
@@ -258,14 +250,13 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:14:58.033914'
   status: completed
-tags: []
 ---
 model_dict = dict(zip(["model_0", "model_1", "model_2"], traces))
 comp = az.compare(model_dict)
 comp
 ```
 
-+++ {"papermill": {"duration": 0.056609, "end_time": "2020-11-29T12:14:58.387481", "exception": false, "start_time": "2020-11-29T12:14:58.330872", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.056609, "end_time": "2020-11-29T12:14:58.387481", "exception": false, "start_time": "2020-11-29T12:14:58.330872", "status": "completed"}}
 
 We can see that the best model is `model_2`, the one with both predictor variables. Notice the DataFrame is ordered from lowest to highest WAIC (_i.e_ from _better_ to _worst_ model). Check the {ref}`pymc:model_comparison` for a more detailed discussion on model comparison.
 
@@ -281,7 +272,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:14:58.444313'
   status: completed
-tags: []
 ---
 ppc_w = pm.sample_posterior_predictive_w(
     traces=traces,
@@ -291,7 +281,7 @@ ppc_w = pm.sample_posterior_predictive_w(
 )
 ```
 
-+++ {"papermill": {"duration": 0.058454, "end_time": "2020-11-29T12:15:30.024455", "exception": false, "start_time": "2020-11-29T12:15:29.966001", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.058454, "end_time": "2020-11-29T12:15:30.024455", "exception": false, "start_time": "2020-11-29T12:15:29.966001", "status": "completed"}}
 
 Notice that we are passing the weights ordered by their index. We are doing this because we pass `traces` and `models` ordered from model 0 to 2, but the computed weights are ordered from lowest to highest WAIC (or equivalently from larger to lowest weight). In summary, we must be sure that we are correctly pairing the weights and models.
 
@@ -305,12 +295,11 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:15:30.082568'
   status: completed
-tags: []
 ---
 ppc_2 = pm.sample_posterior_predictive(trace=trace_2, model=model_2, progressbar=False)
 ```
 
-+++ {"papermill": {"duration": 0.058214, "end_time": "2020-11-29T12:15:55.404271", "exception": false, "start_time": "2020-11-29T12:15:55.346057", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.058214, "end_time": "2020-11-29T12:15:55.404271", "exception": false, "start_time": "2020-11-29T12:15:55.346057", "status": "completed"}}
 
 A simple way to compare both kind of predictions is to plot their mean and hpd interval.
 
@@ -322,7 +311,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:15:55.462809'
   status: completed
-tags: []
 ---
 mean_w = ppc_w["kcal"].mean()
 hpd_w = az.hdi(ppc_w["kcal"].flatten())
@@ -341,7 +329,7 @@ plt.xlabel("kcal per g")
 plt.legend();
 ```
 
-+++ {"papermill": {"duration": 0.05969, "end_time": "2020-11-29T12:15:55.884685", "exception": false, "start_time": "2020-11-29T12:15:55.824995", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.05969, "end_time": "2020-11-29T12:15:55.884685", "exception": false, "start_time": "2020-11-29T12:15:55.824995", "status": "completed"}}
 
 As we can see the mean value is almost the same for both predictions but the uncertainty in the weighted model is larger. We have effectively propagated the uncertainty about which model we should select to the posterior predictive samples. You can now try with the other two methods for computing weights `stacking` (the default and recommended method) and `pseudo-BMA`.
 
@@ -382,7 +370,6 @@ papermill:
   exception: false
   start_time: '2020-11-29T12:16:06.264642'
   status: completed
-tags: []
 ---
 %load_ext watermark
 %watermark -n -u -v -iv -w
diff --git a/examples/gaussian_processes/GP-MeansAndCovs.myst.md b/examples/gaussian_processes/GP-MeansAndCovs.myst.md
index ac0e8ffb6..cb72b478f 100644
--- a/examples/gaussian_processes/GP-MeansAndCovs.myst.md
+++ b/examples/gaussian_processes/GP-MeansAndCovs.myst.md
@@ -31,7 +31,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:26.280834'
   status: completed
-tags: []
 ---
 import arviz as az
 import matplotlib.cm as cmap
@@ -51,7 +50,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:31.626925'
   status: completed
-tags: []
 ---
 RANDOM_SEED = 8927
 
@@ -60,7 +58,7 @@ az.style.use("arviz-darkgrid")
 plt.rcParams["figure.figsize"] = (10, 4)
 ```
 
-+++ {"papermill": {"duration": 0.037844, "end_time": "2020-12-22T18:36:31.751886", "exception": false, "start_time": "2020-12-22T18:36:31.714042", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.037844, "end_time": "2020-12-22T18:36:31.751886", "exception": false, "start_time": "2020-12-22T18:36:31.714042", "status": "completed"}}
 
 A large set of mean and covariance functions are available in PyMC.  It is relatively easy to define custom mean and covariance functions.  Since PyMC uses PyTensor, their gradients do not need to be defined by the user.  
 
@@ -84,7 +82,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:31.790061'
   status: completed
-tags: []
 ---
 zero_func = pm.gp.mean.Zero()
 
@@ -92,7 +89,7 @@ X = np.linspace(0, 1, 5)[:, None]
 print(zero_func(X).eval())
 ```
 
-+++ {"papermill": {"duration": 0.040891, "end_time": "2020-12-22T18:36:32.947028", "exception": false, "start_time": "2020-12-22T18:36:32.906137", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.040891, "end_time": "2020-12-22T18:36:32.947028", "exception": false, "start_time": "2020-12-22T18:36:32.906137", "status": "completed"}}
 
 The default mean functions for all GP implementations in PyMC is `Zero`.
 
@@ -108,14 +105,13 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:32.988259'
   status: completed
-tags: []
 ---
 const_func = pm.gp.mean.Constant(25.2)
 
 print(const_func(X).eval())
 ```
 
-+++ {"papermill": {"duration": 0.039627, "end_time": "2020-12-22T18:36:35.195057", "exception": false, "start_time": "2020-12-22T18:36:35.155430", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.039627, "end_time": "2020-12-22T18:36:35.195057", "exception": false, "start_time": "2020-12-22T18:36:35.155430", "status": "completed"}}
 
 As long as the shape matches the input it will receive, `gp.mean.Constant` can also accept a PyTensor tensor or vector of PyMC random variables.
 
@@ -127,14 +123,13 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:35.235931'
   status: completed
-tags: []
 ---
 const_func_vec = pm.gp.mean.Constant(pt.ones(5))
 
 print(const_func_vec(X).eval())
 ```
 
-+++ {"papermill": {"duration": 0.04127, "end_time": "2020-12-22T18:36:36.726017", "exception": false, "start_time": "2020-12-22T18:36:36.684747", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.04127, "end_time": "2020-12-22T18:36:36.726017", "exception": false, "start_time": "2020-12-22T18:36:36.684747", "status": "completed"}}
 
 ### Linear
 
@@ -148,7 +143,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:36.765472'
   status: completed
-tags: []
 ---
 beta = rng.normal(size=3)
 b = 0.0
@@ -159,7 +153,7 @@ X = rng.normal(size=(5, 3))
 print(lin_func(X).eval())
 ```
 
-+++ {"papermill": {"duration": 0.03931, "end_time": "2020-12-22T18:36:36.918672", "exception": false, "start_time": "2020-12-22T18:36:36.879362", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.03931, "end_time": "2020-12-22T18:36:36.918672", "exception": false, "start_time": "2020-12-22T18:36:36.879362", "status": "completed"}}
 
 ## Defining a custom mean function
 
@@ -181,13 +175,13 @@ class Constant(pm.gp.mean.Mean):
 
 Remember that PyTensor must be used instead of NumPy.
 
-+++ {"papermill": {"duration": 0.039306, "end_time": "2020-12-22T18:36:36.998649", "exception": false, "start_time": "2020-12-22T18:36:36.959343", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.039306, "end_time": "2020-12-22T18:36:36.998649", "exception": false, "start_time": "2020-12-22T18:36:36.959343", "status": "completed"}}
 
 ## Covariance functions
 
 PyMC contains a much larger suite of {mod}`built-in covariance functions <pymc.gp.cov>`.  The following shows functions drawn from a GP prior with a given covariance function, and demonstrates how composite covariance functions can be constructed with Python operators in a straightforward manner.  Our goal was for our API to follow kernel algebra (see Ch.4 of {cite:t}`rasmussen2003gaussian`) as closely as possible.  See the main documentation page for an overview on their usage in PyMC.
 
-+++ {"papermill": {"duration": 0.039789, "end_time": "2020-12-22T18:36:37.078199", "exception": false, "start_time": "2020-12-22T18:36:37.038410", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.039789, "end_time": "2020-12-22T18:36:37.078199", "exception": false, "start_time": "2020-12-22T18:36:37.038410", "status": "completed"}}
 
 ### Exponentiated Quadratic
 
@@ -203,7 +197,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:37.121601'
   status: completed
-tags: []
 ---
 lengthscale = 0.2
 eta = 2.0
@@ -225,7 +218,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.042546, "end_time": "2020-12-22T18:36:44.712169", "exception": false, "start_time": "2020-12-22T18:36:44.669623", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.042546, "end_time": "2020-12-22T18:36:44.712169", "exception": false, "start_time": "2020-12-22T18:36:44.669623", "status": "completed"}}
 
 ### Two (and higher) Dimensional Inputs
 
@@ -241,7 +234,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:44.755778'
   status: completed
-tags: []
 ---
 x1 = np.linspace(0, 1, 10)
 x2 = np.arange(1, 4)
@@ -255,7 +247,7 @@ m = plt.imshow(cov(X2).eval(), cmap="inferno", interpolation="none")
 plt.colorbar(m);
 ```
 
-+++ {"papermill": {"duration": 0.043142, "end_time": "2020-12-22T18:36:48.032797", "exception": false, "start_time": "2020-12-22T18:36:47.989655", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.043142, "end_time": "2020-12-22T18:36:48.032797", "exception": false, "start_time": "2020-12-22T18:36:47.989655", "status": "completed"}}
 
 #### One dimension active
 
@@ -267,7 +259,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:48.076077'
   status: completed
-tags: []
 ---
 ls = 0.2
 cov = pm.gp.cov.ExpQuad(input_dim=2, ls=ls, active_dims=[0])
@@ -276,7 +267,7 @@ m = plt.imshow(cov(X2).eval(), cmap="inferno", interpolation="none")
 plt.colorbar(m);
 ```
 
-+++ {"papermill": {"duration": 0.045376, "end_time": "2020-12-22T18:36:48.840086", "exception": false, "start_time": "2020-12-22T18:36:48.794710", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.045376, "end_time": "2020-12-22T18:36:48.840086", "exception": false, "start_time": "2020-12-22T18:36:48.794710", "status": "completed"}}
 
 #### Product of covariances over different dimensions
 
@@ -290,7 +281,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:48.885155'
   status: completed
-tags: []
 ---
 ls1 = 0.2
 ls2 = 1.0
@@ -302,7 +292,7 @@ m = plt.imshow(cov(X2).eval(), cmap="inferno", interpolation="none")
 plt.colorbar(m);
 ```
 
-+++ {"papermill": {"duration": 0.046821, "end_time": "2020-12-22T18:36:50.579012", "exception": false, "start_time": "2020-12-22T18:36:50.532191", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.046821, "end_time": "2020-12-22T18:36:50.579012", "exception": false, "start_time": "2020-12-22T18:36:50.532191", "status": "completed"}}
 
 ### White Noise
 
@@ -318,7 +308,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:50.625370'
   status: completed
-tags: []
 ---
 sigma = 2.0
 cov = pm.gp.cov.WhiteNoise(sigma)
@@ -335,7 +324,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.05125, "end_time": "2020-12-22T18:36:51.723154", "exception": false, "start_time": "2020-12-22T18:36:51.671904", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.05125, "end_time": "2020-12-22T18:36:51.723154", "exception": false, "start_time": "2020-12-22T18:36:51.671904", "status": "completed"}}
 
 ### Constant
 
@@ -351,7 +340,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:51.774183'
   status: completed
-tags: []
 ---
 c = 2.0
 cov = pm.gp.cov.Constant(c)
@@ -370,7 +358,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.051694, "end_time": "2020-12-22T18:36:53.810105", "exception": false, "start_time": "2020-12-22T18:36:53.758411", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.051694, "end_time": "2020-12-22T18:36:53.810105", "exception": false, "start_time": "2020-12-22T18:36:53.758411", "status": "completed"}}
 
 ### Rational Quadratic
 
@@ -386,7 +374,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:53.863653'
   status: completed
-tags: []
 ---
 alpha = 0.1
 ls = 0.2
@@ -405,7 +392,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.055808, "end_time": "2020-12-22T18:36:56.357806", "exception": false, "start_time": "2020-12-22T18:36:56.301998", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.055808, "end_time": "2020-12-22T18:36:56.357806", "exception": false, "start_time": "2020-12-22T18:36:56.301998", "status": "completed"}}
 
 ### Exponential
 
@@ -421,7 +408,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:56.413112'
   status: completed
-tags: []
 ---
 inverse_lengthscale = 5
 cov = pm.gp.cov.Exponential(1, ls_inv=inverse_lengthscale)
@@ -438,7 +424,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.058891, "end_time": "2020-12-22T18:36:57.874371", "exception": false, "start_time": "2020-12-22T18:36:57.815480", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.058891, "end_time": "2020-12-22T18:36:57.874371", "exception": false, "start_time": "2020-12-22T18:36:57.815480", "status": "completed"}}
 
 ### Matern 5/2
 
@@ -456,7 +442,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:36:57.933356'
   status: completed
-tags: []
 ---
 ls = 0.2
 tau = 2.0
@@ -474,7 +459,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.061663, "end_time": "2020-12-22T18:37:00.473343", "exception": false, "start_time": "2020-12-22T18:37:00.411680", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.061663, "end_time": "2020-12-22T18:37:00.473343", "exception": false, "start_time": "2020-12-22T18:37:00.411680", "status": "completed"}}
 
 ### Matern 3/2
 
@@ -491,7 +476,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:00.534344'
   status: completed
-tags: []
 ---
 ls = 0.2
 tau = 2.0
@@ -509,7 +493,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.064186, "end_time": "2020-12-22T18:37:01.159126", "exception": false, "start_time": "2020-12-22T18:37:01.094940", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.064186, "end_time": "2020-12-22T18:37:01.159126", "exception": false, "start_time": "2020-12-22T18:37:01.094940", "status": "completed"}}
 
 ### Matern 1/2
 
@@ -523,7 +507,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:01.223834'
   status: completed
-tags: []
 ---
 ls = 0.2
 tau = 2.0
@@ -541,7 +524,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.068504, "end_time": "2020-12-22T18:37:01.837835", "exception": false, "start_time": "2020-12-22T18:37:01.769331", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.068504, "end_time": "2020-12-22T18:37:01.837835", "exception": false, "start_time": "2020-12-22T18:37:01.769331", "status": "completed"}}
 
 ### Cosine
 
@@ -557,7 +540,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:01.907064'
   status: completed
-tags: []
 ---
 period = 0.5
 cov = pm.gp.cov.Cosine(1, period)
@@ -576,7 +558,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.077444, "end_time": "2020-12-22T18:37:03.548722", "exception": false, "start_time": "2020-12-22T18:37:03.471278", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.077444, "end_time": "2020-12-22T18:37:03.548722", "exception": false, "start_time": "2020-12-22T18:37:03.471278", "status": "completed"}}
 
 ### Linear
 
@@ -592,7 +574,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:03.621125'
   status: completed
-tags: []
 ---
 c = 1.0
 tau = 2.0
@@ -612,7 +593,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.073236, "end_time": "2020-12-22T18:37:05.293217", "exception": false, "start_time": "2020-12-22T18:37:05.219981", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.073236, "end_time": "2020-12-22T18:37:05.293217", "exception": false, "start_time": "2020-12-22T18:37:05.219981", "status": "completed"}}
 
 ### Polynomial
 
@@ -628,7 +609,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:05.367470'
   status: completed
-tags: []
 ---
 c = 1.0
 d = 3
@@ -650,7 +630,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.07702, "end_time": "2020-12-22T18:37:06.892733", "exception": false, "start_time": "2020-12-22T18:37:06.815713", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.07702, "end_time": "2020-12-22T18:37:06.892733", "exception": false, "start_time": "2020-12-22T18:37:06.815713", "status": "completed"}}
 
 ### Multiplication with a precomputed covariance matrix
 
@@ -664,7 +644,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:06.968855'
   status: completed
-tags: []
 ---
 # first evaluate a covariance function into a matrix
 period = 0.2
@@ -686,7 +665,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.078461, "end_time": "2020-12-22T18:37:08.672218", "exception": false, "start_time": "2020-12-22T18:37:08.593757", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.078461, "end_time": "2020-12-22T18:37:08.672218", "exception": false, "start_time": "2020-12-22T18:37:08.593757", "status": "completed"}}
 
 ### Applying an arbitrary warping function on the inputs
 
@@ -702,7 +681,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:08.751821'
   status: completed
-tags: []
 ---
 def warp_func(x, a, b, c):
     return 1.0 + x + (a * pt.tanh(b * (x - c)))
@@ -735,7 +713,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.085228, "end_time": "2020-12-22T18:37:14.983640", "exception": false, "start_time": "2020-12-22T18:37:14.898412", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.085228, "end_time": "2020-12-22T18:37:14.983640", "exception": false, "start_time": "2020-12-22T18:37:14.898412", "status": "completed"}}
 
 ### Constructing `Periodic` using `WarpedInput`
 
@@ -760,7 +738,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:15.070404'
   status: completed
-tags: []
 ---
 def mapping(x, T):
     c = 2.0 * np.pi * (1.0 / T)
@@ -787,7 +764,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.089186, "end_time": "2020-12-22T18:37:18.877629", "exception": false, "start_time": "2020-12-22T18:37:18.788443", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.089186, "end_time": "2020-12-22T18:37:18.877629", "exception": false, "start_time": "2020-12-22T18:37:18.788443", "status": "completed"}}
 
 ### Periodic
 
@@ -801,7 +778,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:18.966476'
   status: completed
-tags: []
 ---
 period = 0.6
 ls = 0.4
@@ -822,7 +798,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.090578, "end_time": "2020-12-22T18:37:21.604122", "exception": false, "start_time": "2020-12-22T18:37:21.513544", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.090578, "end_time": "2020-12-22T18:37:21.604122", "exception": false, "start_time": "2020-12-22T18:37:21.513544", "status": "completed"}}
 
 ### Circular
 
@@ -855,7 +831,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:21.695696'
   status: completed
-tags: []
 ---
 period = 0.6
 tau = 4
@@ -874,7 +849,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.094257, "end_time": "2020-12-22T18:37:26.237410", "exception": false, "start_time": "2020-12-22T18:37:26.143153", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.094257, "end_time": "2020-12-22T18:37:26.237410", "exception": false, "start_time": "2020-12-22T18:37:26.143153", "status": "completed"}}
 
 We can see the effect of $\tau$, it adds more non-smooth patterns
 
@@ -886,7 +861,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:26.332697'
   status: completed
-tags: []
 ---
 period = 0.6
 tau = 40
@@ -905,7 +879,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.099739, "end_time": "2020-12-22T18:37:27.146953", "exception": false, "start_time": "2020-12-22T18:37:27.047214", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.099739, "end_time": "2020-12-22T18:37:27.146953", "exception": false, "start_time": "2020-12-22T18:37:27.047214", "status": "completed"}}
 
 ### Gibbs
 
@@ -919,7 +893,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:27.246895'
   status: completed
-tags: []
 ---
 def tanh_func(x, ls1, ls2, w, x0):
     """
@@ -955,7 +928,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.106362, "end_time": "2020-12-22T18:37:32.238582", "exception": false, "start_time": "2020-12-22T18:37:32.132220", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.106362, "end_time": "2020-12-22T18:37:32.238582", "exception": false, "start_time": "2020-12-22T18:37:32.132220", "status": "completed"}}
 
 ### Scaled Covariance
 
@@ -975,7 +948,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:32.343873'
   status: completed
-tags: []
 ---
 def logistic(x, a, x0, c, d):
     # a is the slope, x0 is the location
@@ -1010,7 +982,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.109017, "end_time": "2020-12-22T18:37:39.017681", "exception": false, "start_time": "2020-12-22T18:37:38.908664", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.109017, "end_time": "2020-12-22T18:37:39.017681", "exception": false, "start_time": "2020-12-22T18:37:38.908664", "status": "completed"}}
 
 ### Constructing a Changepoint kernel using `ScaledCov`
 
@@ -1033,7 +1005,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:39.126841'
   status: completed
-tags: []
 ---
 def logistic(x, a, x0):
     # a is the slope, x0 is the location
@@ -1080,7 +1051,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.123091, "end_time": "2020-12-22T18:37:41.801550", "exception": false, "start_time": "2020-12-22T18:37:41.678459", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.123091, "end_time": "2020-12-22T18:37:41.801550", "exception": false, "start_time": "2020-12-22T18:37:41.678459", "status": "completed"}}
 
 ### Combination of two or more Covariance functions
 
@@ -1092,7 +1063,7 @@ In particular, you can perform the following operations on any covaraince functi
 - Multiply with a scalar or a covariance function with equal or broadcastable dimensions with first covariance function
 - Exponentiate with a scalar.
 
-+++ {"papermill": {"duration": 0.114783, "end_time": "2020-12-22T18:37:42.043753", "exception": false, "start_time": "2020-12-22T18:37:41.928970", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.114783, "end_time": "2020-12-22T18:37:42.043753", "exception": false, "start_time": "2020-12-22T18:37:41.928970", "status": "completed"}}
 
 #### Addition
 
@@ -1104,7 +1075,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:42.157152'
   status: completed
-tags: []
 ---
 ls_1 = 0.1
 tau_1 = 2.0
@@ -1129,7 +1099,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.11646, "end_time": "2020-12-22T18:37:42.956319", "exception": false, "start_time": "2020-12-22T18:37:42.839859", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.11646, "end_time": "2020-12-22T18:37:42.956319", "exception": false, "start_time": "2020-12-22T18:37:42.839859", "status": "completed"}}
 
 #### Multiplication
 
@@ -1141,7 +1111,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:43.072966'
   status: completed
-tags: []
 ---
 ls_1 = 0.1
 tau_1 = 2.0
@@ -1166,7 +1135,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.125568, "end_time": "2020-12-22T18:37:43.873379", "exception": false, "start_time": "2020-12-22T18:37:43.747811", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.125568, "end_time": "2020-12-22T18:37:43.873379", "exception": false, "start_time": "2020-12-22T18:37:43.747811", "status": "completed"}}
 
 #### Exponentiation
 
@@ -1178,7 +1147,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:43.996275'
   status: completed
-tags: []
 ---
 ls_1 = 0.1
 tau_1 = 2.0
@@ -1201,7 +1169,7 @@ plt.ylabel("y")
 plt.xlabel("X");
 ```
 
-+++ {"papermill": {"duration": 0.124028, "end_time": "2020-12-22T18:37:44.770709", "exception": false, "start_time": "2020-12-22T18:37:44.646681", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.124028, "end_time": "2020-12-22T18:37:44.770709", "exception": false, "start_time": "2020-12-22T18:37:44.646681", "status": "completed"}}
 
 ### Defining a custom covariance function
 
@@ -1253,7 +1221,6 @@ papermill:
   exception: false
   start_time: '2020-12-22T18:37:54.811393'
   status: completed
-tags: []
 ---
 %load_ext watermark
 %watermark -n -u -v -iv -w -p aeppl,xarray
diff --git a/examples/gaussian_processes/GP-smoothing.ipynb b/examples/gaussian_processes/GP-smoothing.ipynb
index ade43eeb6..f19c2ef3e 100644
--- a/examples/gaussian_processes/GP-smoothing.ipynb
+++ b/examples/gaussian_processes/GP-smoothing.ipynb
@@ -538,7 +538,6 @@
     "fig, axes = subplots(2, 2)\n",
     "\n",
     "for ax, smoothing in zip(axes.ravel(), [0.95, 0.99, 0.999, 0.9999]):\n",
-    "\n",
     "    z_val = infer_z(smoothing)\n",
     "\n",
     "    ax.plot(x, y)\n",
diff --git a/examples/gaussian_processes/GP-smoothing.myst.md b/examples/gaussian_processes/GP-smoothing.myst.md
index ecde091a9..72a7b3f02 100644
--- a/examples/gaussian_processes/GP-smoothing.myst.md
+++ b/examples/gaussian_processes/GP-smoothing.myst.md
@@ -160,7 +160,6 @@ By increasing the smoothing parameter, we can gradually make the inferred values
 fig, axes = subplots(2, 2)
 
 for ax, smoothing in zip(axes.ravel(), [0.95, 0.99, 0.999, 0.9999]):
-
     z_val = infer_z(smoothing)
 
     ax.plot(x, y)
diff --git a/examples/gaussian_processes/MOGP-Coregion-Hadamard.myst.md b/examples/gaussian_processes/MOGP-Coregion-Hadamard.myst.md
index 665409b89..d18523526 100644
--- a/examples/gaussian_processes/MOGP-Coregion-Hadamard.myst.md
+++ b/examples/gaussian_processes/MOGP-Coregion-Hadamard.myst.md
@@ -203,8 +203,6 @@ X_new = np.hstack([X_new, output_idx])
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 %%time
 with model:
     preds = mogp.conditional("preds", X_new)
@@ -313,8 +311,6 @@ with model:
 ### Prediction
 
 ```{code-cell} ipython3
-:tags: []
-
 %%time
 with model:
     preds = mogp.conditional("preds", X_new)
diff --git a/examples/gaussian_processes/gaussian_process.ipynb b/examples/gaussian_processes/gaussian_process.ipynb
index a6f7d40fd..67880cca7 100644
--- a/examples/gaussian_processes/gaussian_process.ipynb
+++ b/examples/gaussian_processes/gaussian_process.ipynb
@@ -130,7 +130,6 @@
    "outputs": [],
    "source": [
     "with pm.Model() as gp_fit:\n",
-    "\n",
     "    mu = np.zeros(N)\n",
     "\n",
     "    eta_sq = pm.HalfCauchy(\"eta_sq\", 5)\n",
diff --git a/examples/gaussian_processes/gaussian_process.myst.md b/examples/gaussian_processes/gaussian_process.myst.md
index 51d3719f7..ea4edda51 100644
--- a/examples/gaussian_processes/gaussian_process.myst.md
+++ b/examples/gaussian_processes/gaussian_process.myst.md
@@ -94,7 +94,6 @@ squared_distance = lambda x, y: (x[None, :] - y[:, None]) ** 2
 
 ```{code-cell} ipython3
 with pm.Model() as gp_fit:
-
     mu = np.zeros(N)
 
     eta_sq = pm.HalfCauchy("eta_sq", 5)
diff --git a/examples/generalized_linear_models/GLM-binomial-regression.myst.md b/examples/generalized_linear_models/GLM-binomial-regression.myst.md
index 452fb610e..fb2774fe4 100644
--- a/examples/generalized_linear_models/GLM-binomial-regression.myst.md
+++ b/examples/generalized_linear_models/GLM-binomial-regression.myst.md
@@ -62,8 +62,6 @@ $$
 This defines our likelihood function. All you need now to get some Bayesian Binomial regression done is priors over the $\beta$ parameters. The observed data are $y_i$, $n$, and $x_i$.
 
 ```{code-cell} ipython3
-:tags: []
-
 import arviz as az
 import matplotlib.pyplot as plt
 import numpy as np
@@ -74,8 +72,6 @@ from scipy.special import expit
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 %config InlineBackend.figure_format = 'retina'
 az.style.use("arviz-darkgrid")
 rng = np.random.default_rng(1234)
@@ -252,8 +248,6 @@ A good introduction to generalized linear models is provided by {cite:t}`roback2
 ## Watermark
 
 ```{code-cell} ipython3
-:tags: []
-
 %load_ext watermark
 %watermark -n -u -v -iv -w -p pytensor,aeppl
 ```
diff --git a/examples/generalized_linear_models/GLM-hierarchical-binomial-model.ipynb b/examples/generalized_linear_models/GLM-hierarchical-binomial-model.ipynb
index 800be8eaf..32009e17c 100644
--- a/examples/generalized_linear_models/GLM-hierarchical-binomial-model.ipynb
+++ b/examples/generalized_linear_models/GLM-hierarchical-binomial-model.ipynb
@@ -140,17 +140,14 @@
     "\n",
     "\n",
     "def log_prior(A, B):\n",
-    "\n",
     "    return -5 / 2 * np.log(A + B)\n",
     "\n",
     "\n",
     "def trans_to_beta(x, y):\n",
-    "\n",
     "    return np.exp(y) / (np.exp(x) + 1)\n",
     "\n",
     "\n",
     "def trans_to_alpha(x, y):\n",
-    "\n",
     "    return np.exp(x) * trans_to_beta(x, y)\n",
     "\n",
     "\n",
diff --git a/examples/generalized_linear_models/GLM-hierarchical-binomial-model.myst.md b/examples/generalized_linear_models/GLM-hierarchical-binomial-model.myst.md
index 79eb08991..d42408df6 100644
--- a/examples/generalized_linear_models/GLM-hierarchical-binomial-model.myst.md
+++ b/examples/generalized_linear_models/GLM-hierarchical-binomial-model.myst.md
@@ -117,17 +117,14 @@ def log_likelihood(alpha, beta, y, n):
 
 
 def log_prior(A, B):
-
     return -5 / 2 * np.log(A + B)
 
 
 def trans_to_beta(x, y):
-
     return np.exp(y) / (np.exp(x) + 1)
 
 
 def trans_to_alpha(x, y):
-
     return np.exp(x) * trans_to_beta(x, y)
 
 
diff --git a/examples/generalized_linear_models/GLM-model-selection.ipynb b/examples/generalized_linear_models/GLM-model-selection.ipynb
index 67a4d4e4b..1b34ad073 100644
--- a/examples/generalized_linear_models/GLM-model-selection.ipynb
+++ b/examples/generalized_linear_models/GLM-model-selection.ipynb
@@ -723,7 +723,6 @@
     "    models, results = dict(), dict()\n",
     "\n",
     "    for k in range(1, upper_order + 1):\n",
-    "\n",
     "        nm = f\"k{k}\"\n",
     "        fml = create_poly_modelspec(k)\n",
     "\n",
diff --git a/examples/generalized_linear_models/GLM-model-selection.myst.md b/examples/generalized_linear_models/GLM-model-selection.myst.md
index 5ef3b5ea8..e0c40a19e 100644
--- a/examples/generalized_linear_models/GLM-model-selection.myst.md
+++ b/examples/generalized_linear_models/GLM-model-selection.myst.md
@@ -424,7 +424,6 @@ def run_models(df, upper_order=5):
     models, results = dict(), dict()
 
     for k in range(1, upper_order + 1):
-
         nm = f"k{k}"
         fml = create_poly_modelspec(k)
 
diff --git a/examples/generalized_linear_models/GLM-ordinal-regression.ipynb b/examples/generalized_linear_models/GLM-ordinal-regression.ipynb
index 70a7db1ec..f0b8be3ca 100644
--- a/examples/generalized_linear_models/GLM-ordinal-regression.ipynb
+++ b/examples/generalized_linear_models/GLM-ordinal-regression.ipynb
@@ -818,7 +818,6 @@
    "source": [
     "def make_model(priors, model_spec=1, constrained_uniform=False, logit=True):\n",
     "    with pm.Model() as model:\n",
-    "\n",
     "        if constrained_uniform:\n",
     "            cutpoints = constrainedUniform(K, 0, K)\n",
     "        else:\n",
@@ -3347,7 +3346,6 @@
     "\n",
     "def make_movies_model(ordered=False):\n",
     "    with pm.Model() as model:\n",
-    "\n",
     "        for g in movies_by_rating[\"movie_id\"].unique():\n",
     "            if ordered:\n",
     "                cutpoints = constrainedUniform(K, g, 0, K - 1)\n",
diff --git a/examples/generalized_linear_models/GLM-ordinal-regression.myst.md b/examples/generalized_linear_models/GLM-ordinal-regression.myst.md
index 833db1ab2..2f58be878 100644
--- a/examples/generalized_linear_models/GLM-ordinal-regression.myst.md
+++ b/examples/generalized_linear_models/GLM-ordinal-regression.myst.md
@@ -20,8 +20,6 @@ kernelspec:
 :::
 
 ```{code-cell} ipython3
-:tags: []
-
 import arviz as az
 import matplotlib.pyplot as plt
 import numpy as np
@@ -35,8 +33,6 @@ from statsmodels.miscmodels.ordinal_model import OrderedModel
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 %config InlineBackend.figure_format = 'retina'  # high resolution figures
 az.style.use("arviz-darkgrid")
 rng = np.random.default_rng(42)
@@ -55,8 +51,6 @@ We'll start by generating some fake data. Imagine an employee/manager relationsh
 Ordinal Regression is a statistical technique designed to **model** these kinds of relationships and can be contrasted to a design based approach where the focus is to extract simple statistical summaries e.g. proportions, counts or ratios in the context of a survey design under strong guarantees about the error tolerance in those derived summaries.
 
 ```{code-cell} ipython3
-:tags: []
-
 def make_data():
     np.random.seed(100)
     salary = np.random.normal(40, 10, 500)
@@ -92,8 +86,6 @@ df.head()
 We've specified our data in such a way that there is an underlying latent sentiment which is continuous in scale that gets crudely discretised to represent the ordinal rating scale. We've specified the data in such a way that salary drives a fairly linear increase in the manager's rating. 
 
 ```{code-cell} ipython3
-:tags: []
-
 fig, axs = plt.subplots(1, 3, figsize=(15, 5))
 axs = axs.flatten()
 ax = axs[0]
@@ -122,8 +114,6 @@ axs[1].legend();
 We can see here however that if we fit this model with a simple OLS fit it implies values beyond the categorical scale, which might motivate spurious salary increases by an overzealous HR manager. The OLS approximation is limited in that it cannot account for the proper nature of the outcome variable. 
 
 ```{code-cell} ipython3
-:tags: []
-
 exog = sm.add_constant(df[["salary", "work_from_home", "work_sat"]])
 mod = sm.OLS(df["explicit_rating"], exog)
 results = mod.fit()
@@ -189,8 +179,6 @@ We'll show both in this notebook, but as we'll see, the definition of the Dirchl
 PyMC has both `OrderedLogistic` and `OrderedProbit` distributions available. 
 
 ```{code-cell} ipython3
-:tags: []
-
 def constrainedUniform(N, min=0, max=1):
     return pm.Deterministic(
         "cutpoints",
@@ -211,7 +199,6 @@ The above function, (brainchild of Dr Ben Vincent and Adrian Seyboldt), looks a
 
 def make_model(priors, model_spec=1, constrained_uniform=False, logit=True):
     with pm.Model() as model:
-
         if constrained_uniform:
             cutpoints = constrainedUniform(K, 0, K)
         else:
@@ -252,14 +239,10 @@ idata5, model5 = make_model(priors, model_spec=3, constrained_uniform=True)
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.summary(idata3, var_names=["sigma", "cutpoints", "beta"])
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 pm.model_to_graphviz(model3)
 ```
 
@@ -268,21 +251,15 @@ pm.model_to_graphviz(model3)
 We can now for each individual manager's rating, look at the probability associated with each of the available categories. Across the posterior distributions of our cuts which section of the latent continous measure the employee is most likely to fall into.
 
 ```{code-cell} ipython3
-:tags: []
-
 implied_probs = az.extract(idata3, var_names=["y_probs"])
 implied_probs.shape
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 implied_probs[0].mean(axis=1)
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 fig, ax = plt.subplots(figsize=(20, 6))
 for i in range(K):
     ax.hist(implied_probs[0, i, :], label=f"Cutpoint: {i}", ec="white", bins=20, alpha=0.4)
@@ -292,23 +269,17 @@ ax.legend();
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 implied_class = az.extract(idata3, var_names=["y"], group="posterior_predictive")
 implied_class.shape
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 from scipy.stats import mode
 
 mode(implied_class[0])
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 fig, ax = plt.subplots(figsize=(20, 6))
 ax.hist(implied_class[0], ec="white", bins=20, alpha=0.4)
 ax.set_title("Distribution of Allocated Intervals for Individual O", fontsize=20);
@@ -321,8 +292,6 @@ ax.set_title("Distribution of Allocated Intervals for Individual O", fontsize=20
 One tool for ameliorating the risk of model misspecification is to compare amongst different candidate model to check for  predictive accuracy. 
 
 ```{code-cell} ipython3
-:tags: []
-
 compare = az.compare(
     {
         "model_salary": idata1,
@@ -340,8 +309,6 @@ compare
 We can also compare the estimated parameters which govern each regression model to see how robust our model fit is to alternative parameterisation. 
 
 ```{code-cell} ipython3
-:tags: []
-
 ax = az.plot_forest(
     [idata1, idata2, idata3, idata4, idata5],
     var_names=["sigma", "beta", "cutpoints"],
@@ -362,8 +329,6 @@ ax[0].set_title("Model Parameter Estimates", fontsize=20);
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.summary(idata3, var_names=["cutpoints", "beta", "sigma"])
 ```
 
@@ -372,8 +337,6 @@ az.summary(idata3, var_names=["cutpoints", "beta", "sigma"])
 Note how the model with unconstrianed cutpoints allows the occurence of a threshold estimated to be below zero. This does not make much conceptual sense, but can lead to a plausible enough posterior predictive distribution.
 
 ```{code-cell} ipython3
-:tags: []
-
 def plot_fit(idata):
     posterior = idata.posterior.stack(sample=("chain", "draw"))
     fig, axs = plt.subplots(1, 2, figsize=(20, 6))
@@ -394,14 +357,10 @@ plot_fit(idata3)
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_posterior(idata3, var_names=["beta"]);
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.summary(idata3, var_names=["cutpoints"])
 ```
 
@@ -410,22 +369,16 @@ While the parameter estimates seem reasonable and the posterior predictive check
 However if we look at the model with the constrained Dirchlet prior: 
 
 ```{code-cell} ipython3
-:tags: []
-
 plot_fit(idata4)
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_posterior(idata4, var_names=["beta"]);
 ```
 
 Again the parameters seem reasonable, and posterior predictive checks are sound. But now, having using the constrained uniform prior over the cutpoints our estimated cutpoints respect the definition of the ordinal scale. 
 
 ```{code-cell} ipython3
-:tags: []
-
 az.summary(idata4, var_names=["cutpoints"])
 ```
 
@@ -434,8 +387,6 @@ az.summary(idata4, var_names=["cutpoints"])
 This type of model can also be estimated in the frequentist tradition using maximum likelihood methods.
 
 ```{code-cell} ipython3
-:tags: []
-
 modf_logit = OrderedModel.from_formula(
     "explicit_rating ~ salary + work_sat + work_from_home", df, distr="logit"
 )
@@ -455,8 +406,6 @@ modf_logit.transform_threshold_params(resf_logit.params[-num_of_thresholds:])
 When we want to asses the implications of the model we can use the learned posterior estimates for the data generating equation, to simulate what proportions of survey results would have resulted in a rating over a particular threshold score. Here we allow for full uncertainty of the various beta-distributions to be represented under different working conditions and measure the proportion of employees who would give their manager a rating above a 7. 
 
 ```{code-cell} ipython3
-:tags: []
-
 betas_posterior = az.extract(idata4)["beta"]
 
 fig, ax = plt.subplots(figsize=(20, 10))
@@ -511,8 +460,6 @@ ax.legend();
 There are substantial reasons for using an ordinal regression model rather than trusting to alternative model specifications. For instance, the temptation to treat the ordered category as a continuous metric will lead to false inferences. The details are discussed in the Liddell and Kruschke paper {cite:p}`LIDDELL2018328` on this topic. We'll briefly replicate their example about how this phenomenon can appear in analysis of movies ratings data.
 
 ```{code-cell} ipython3
-:tags: []
-
 try:
     movies = pd.read_csv("../data/MoviesData.csv")
 except FileNotFoundError:
@@ -520,8 +467,6 @@ except FileNotFoundError:
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 def pivot_movie(row):
     row_ratings = row[["n1", "n2", "n3", "n4", "n5"]]
     totals = []
@@ -539,14 +484,10 @@ movies_by_rating.shape
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 movies_by_rating.sample(100).head()
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 def constrainedUniform(N, group, min=0, max=1):
     return pm.Deterministic(
         f"cutpoints_{group}",
@@ -574,7 +515,6 @@ priors = {"sigma": 1, "mu": [0, 1], "cut_mu": np.linspace(0, K, K - 1)}
 
 def make_movies_model(ordered=False):
     with pm.Model() as model:
-
         for g in movies_by_rating["movie_id"].unique():
             if ordered:
                 cutpoints = constrainedUniform(K, g, 0, K - 1)
@@ -610,8 +550,6 @@ idata_normal_metric, model_normal_metric = make_movies_model(ordered=False)
 This is a horrific fit to the movies rating data for six movies.
 
 ```{code-cell} ipython3
-:tags: []
-
 axs = az.plot_ppc(idata_normal_metric)
 axs = axs.flatten()
 for ax in axs:
@@ -623,8 +561,6 @@ for ax in axs:
 This shows a much nicer fit for each of the six movies. 
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_ppc(idata_ordered);
 ```
 
@@ -635,28 +571,20 @@ Since this is real data and we don't know the true data generating process it's
 ### Compare Model Fits
 
 ```{code-cell} ipython3
-:tags: []
-
 y_5_compare = az.compare({"ordered": idata_ordered, "metric": idata_normal_metric}, var_name="y_5")
 y_5_compare
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 y_6_compare = az.compare({"ordered": idata_ordered, "metric": idata_normal_metric}, var_name="y_6")
 y_6_compare
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_compare(y_5_compare)
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_compare(y_6_compare)
 ```
 
@@ -665,8 +593,6 @@ az.plot_compare(y_6_compare)
 Aside from the predictive fits, the inferences drawn from the different modelling choices also vary quite significantly. Imagine being a movie executive trying to decide whether to commit to a sequel, then relative movie performance rating against competitor  benchmarks might be a pivotal feature of this decision, and difference induced by the analyst's choice of model can have an outsized effect on that choice. 
 
 ```{code-cell} ipython3
-:tags: []
-
 mosaic = """
 AC
 DE
diff --git a/examples/generalized_linear_models/GLM-poisson-regression.ipynb b/examples/generalized_linear_models/GLM-poisson-regression.ipynb
index c7c6c63bb..894a327e2 100644
--- a/examples/generalized_linear_models/GLM-poisson-regression.ipynb
+++ b/examples/generalized_linear_models/GLM-poisson-regression.ipynb
@@ -832,7 +832,6 @@
    "outputs": [],
    "source": [
     "with pm.Model() as mdl_fish:\n",
-    "\n",
     "    # define priors, weakly informative Normal\n",
     "    b0 = pm.Normal(\"Intercept\", mu=0, sigma=10)\n",
     "    b1 = pm.Normal(\"alcohol\", mu=0, sigma=10)\n",
diff --git a/examples/generalized_linear_models/GLM-poisson-regression.myst.md b/examples/generalized_linear_models/GLM-poisson-regression.myst.md
index 506944180..61ff5e02b 100644
--- a/examples/generalized_linear_models/GLM-poisson-regression.myst.md
+++ b/examples/generalized_linear_models/GLM-poisson-regression.myst.md
@@ -10,7 +10,7 @@ kernelspec:
   name: pymc-ex
 ---
 
-+++ {"papermill": {"duration": 0.043172, "end_time": "2021-02-23T11:26:55.064791", "exception": false, "start_time": "2021-02-23T11:26:55.021619", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.043172, "end_time": "2021-02-23T11:26:55.064791", "exception": false, "start_time": "2021-02-23T11:26:55.021619", "status": "completed"}}
 
 (GLM-poisson-regression)=
 # GLM: Poisson Regression
@@ -21,7 +21,7 @@ kernelspec:
 :author: Jonathan Sedar, Benjamin Vincent
 :::
 
-+++ {"papermill": {"duration": 0.069202, "end_time": "2021-02-23T11:27:01.489628", "exception": false, "start_time": "2021-02-23T11:27:01.420426", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.069202, "end_time": "2021-02-23T11:27:01.489628", "exception": false, "start_time": "2021-02-23T11:27:01.420426", "status": "completed"}}
 
 This is a minimal reproducible example of Poisson regression to predict counts using dummy data.
 
@@ -47,7 +47,6 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:26:55.108848'
   status: completed
-tags: []
 ---
 import arviz as az
 import bambi as bmb
@@ -68,7 +67,6 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:01.237926'
   status: completed
-tags: []
 ---
 RANDOM_SEED = 8927
 rng = np.random.default_rng(RANDOM_SEED)
@@ -77,15 +75,15 @@ rng = np.random.default_rng(RANDOM_SEED)
 az.style.use("arviz-darkgrid")
 ```
 
-+++ {"papermill": {"duration": 0.06268, "end_time": "2021-02-23T11:27:01.615645", "exception": false, "start_time": "2021-02-23T11:27:01.552965", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.06268, "end_time": "2021-02-23T11:27:01.615645", "exception": false, "start_time": "2021-02-23T11:27:01.552965", "status": "completed"}}
 
 ## Local Functions
 
-+++ {"papermill": {"duration": 0.073451, "end_time": "2021-02-23T11:27:01.763249", "exception": false, "start_time": "2021-02-23T11:27:01.689798", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.073451, "end_time": "2021-02-23T11:27:01.763249", "exception": false, "start_time": "2021-02-23T11:27:01.689798", "status": "completed"}}
 
 ## Generate Data
 
-+++ {"papermill": {"duration": 0.060542, "end_time": "2021-02-23T11:27:01.884617", "exception": false, "start_time": "2021-02-23T11:27:01.824075", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.060542, "end_time": "2021-02-23T11:27:01.884617", "exception": false, "start_time": "2021-02-23T11:27:01.824075", "status": "completed"}}
 
 This dummy dataset is created to emulate some data created as part of a study into quantified self, and the real data is more complicated than this. Ask Ian Osvald if you'd like to know more [@ianozvald](https://twitter.com/ianozsvald).
 
@@ -109,7 +107,6 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:01.949653'
   status: completed
-tags: []
 ---
 # decide poisson theta values
 theta_noalcohol_meds = 1  # no alcohol, took an antihist
@@ -157,12 +154,11 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:02.083286'
   status: completed
-tags: []
 ---
 df.tail()
 ```
 
-+++ {"papermill": {"duration": 0.071086, "end_time": "2021-02-23T11:27:02.312429", "exception": false, "start_time": "2021-02-23T11:27:02.241343", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.071086, "end_time": "2021-02-23T11:27:02.312429", "exception": false, "start_time": "2021-02-23T11:27:02.241343", "status": "completed"}}
 
 ##### View means of the various combinations (Poisson mean values)
 
@@ -174,12 +170,11 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:02.367642'
   status: completed
-tags: []
 ---
 df.groupby(["alcohol", "nomeds"]).mean().unstack()
 ```
 
-+++ {"papermill": {"duration": 0.054583, "end_time": "2021-02-23T11:27:02.561633", "exception": false, "start_time": "2021-02-23T11:27:02.507050", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.054583, "end_time": "2021-02-23T11:27:02.561633", "exception": false, "start_time": "2021-02-23T11:27:02.507050", "status": "completed"}}
 
 ### Briefly Describe Dataset
 
@@ -191,7 +186,6 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:02.613464'
   status: completed
-tags: []
 ---
 g = sns.catplot(
     x="nsneeze",
@@ -207,7 +201,7 @@ for ax in (g.axes[1, 0], g.axes[1, 1]):
         label.set_visible(n % 5 == 0)
 ```
 
-+++ {"papermill": {"duration": 0.049808, "end_time": "2021-02-23T11:27:05.231176", "exception": false, "start_time": "2021-02-23T11:27:05.181368", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.049808, "end_time": "2021-02-23T11:27:05.231176", "exception": false, "start_time": "2021-02-23T11:27:05.181368", "status": "completed"}}
 
 **Observe:**
 
@@ -217,15 +211,15 @@ for ax in (g.axes[1, 0], g.axes[1, 1]):
 + Changing `nomeds == True` (lower-left) increases the sneeze count `nsneeze` further
 + Changing both `alcohol == True and nomeds == True` (lower-right) increases the sneeze count `nsneeze` a lot, increasing both the mean and variance.
 
-+++ {"papermill": {"duration": 0.049476, "end_time": "2021-02-23T11:27:05.330914", "exception": false, "start_time": "2021-02-23T11:27:05.281438", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.049476, "end_time": "2021-02-23T11:27:05.330914", "exception": false, "start_time": "2021-02-23T11:27:05.281438", "status": "completed"}}
 
 ---
 
-+++ {"papermill": {"duration": 0.054536, "end_time": "2021-02-23T11:27:05.438038", "exception": false, "start_time": "2021-02-23T11:27:05.383502", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.054536, "end_time": "2021-02-23T11:27:05.438038", "exception": false, "start_time": "2021-02-23T11:27:05.383502", "status": "completed"}}
 
 ## Poisson Regression
 
-+++ {"papermill": {"duration": 0.048945, "end_time": "2021-02-23T11:27:05.540630", "exception": false, "start_time": "2021-02-23T11:27:05.491685", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.048945, "end_time": "2021-02-23T11:27:05.540630", "exception": false, "start_time": "2021-02-23T11:27:05.491685", "status": "completed"}}
 
 Our model here is a very simple Poisson regression, allowing for interaction of terms:
 
@@ -233,7 +227,7 @@ $$ \theta = exp(\beta X)$$
 
 $$ Y_{sneeze\_count} \sim Poisson(\theta)$$
 
-+++ {"papermill": {"duration": 0.04972, "end_time": "2021-02-23T11:27:05.641588", "exception": false, "start_time": "2021-02-23T11:27:05.591868", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.04972, "end_time": "2021-02-23T11:27:05.641588", "exception": false, "start_time": "2021-02-23T11:27:05.591868", "status": "completed"}}
 
 **Create linear model for interaction of terms**
 
@@ -245,7 +239,6 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:05.691437'
   status: completed
-tags: []
 ---
 fml = "nsneeze ~ alcohol + nomeds + alcohol:nomeds"  # full formulae formulation
 ```
@@ -258,16 +251,15 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:05.800805'
   status: completed
-tags: []
 ---
 fml = "nsneeze ~ alcohol * nomeds"  # lazy, alternative formulae formulation
 ```
 
-+++ {"papermill": {"duration": 0.048682, "end_time": "2021-02-23T11:27:05.958802", "exception": false, "start_time": "2021-02-23T11:27:05.910120", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.048682, "end_time": "2021-02-23T11:27:05.958802", "exception": false, "start_time": "2021-02-23T11:27:05.910120", "status": "completed"}}
 
 ### 1. Manual method, create design matrices and manually specify model
 
-+++ {"papermill": {"duration": 0.049076, "end_time": "2021-02-23T11:27:06.059305", "exception": false, "start_time": "2021-02-23T11:27:06.010229", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.049076, "end_time": "2021-02-23T11:27:06.059305", "exception": false, "start_time": "2021-02-23T11:27:06.010229", "status": "completed"}}
 
 **Create Design Matrices**
 
@@ -284,7 +276,7 @@ mx_en = dm.response.as_dataframe()
 mx_ex
 ```
 
-+++ {"papermill": {"duration": 0.062897, "end_time": "2021-02-23T11:27:06.420853", "exception": false, "start_time": "2021-02-23T11:27:06.357956", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.062897, "end_time": "2021-02-23T11:27:06.420853", "exception": false, "start_time": "2021-02-23T11:27:06.357956", "status": "completed"}}
 
 **Create Model**
 
@@ -296,10 +288,8 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:06.483418'
   status: completed
-tags: []
 ---
 with pm.Model() as mdl_fish:
-
     # define priors, weakly informative Normal
     b0 = pm.Normal("Intercept", mu=0, sigma=10)
     b1 = pm.Normal("alcohol", mu=0, sigma=10)
@@ -318,7 +308,7 @@ with pm.Model() as mdl_fish:
     y = pm.Poisson("y", mu=pm.math.exp(theta), observed=mx_en["nsneeze"].values)
 ```
 
-+++ {"papermill": {"duration": 0.049445, "end_time": "2021-02-23T11:27:35.720870", "exception": false, "start_time": "2021-02-23T11:27:35.671425", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.049445, "end_time": "2021-02-23T11:27:35.720870", "exception": false, "start_time": "2021-02-23T11:27:35.671425", "status": "completed"}}
 
 **Sample Model**
 
@@ -330,14 +320,13 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:27:35.769855'
   status: completed
-tags: []
 ---
 with mdl_fish:
     inf_fish = pm.sample()
     # inf_fish.extend(pm.sample_posterior_predictive(inf_fish))
 ```
 
-+++ {"papermill": {"duration": 0.118023, "end_time": "2021-02-23T11:29:24.142987", "exception": false, "start_time": "2021-02-23T11:29:24.024964", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.118023, "end_time": "2021-02-23T11:29:24.142987", "exception": false, "start_time": "2021-02-23T11:29:24.024964", "status": "completed"}}
 
 **View Diagnostics**
 
@@ -349,18 +338,17 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:29:24.242675'
   status: completed
-tags: []
 ---
 az.plot_trace(inf_fish);
 ```
 
-+++ {"papermill": {"duration": 0.076462, "end_time": "2021-02-23T11:29:28.790410", "exception": false, "start_time": "2021-02-23T11:29:28.713948", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.076462, "end_time": "2021-02-23T11:29:28.790410", "exception": false, "start_time": "2021-02-23T11:29:28.713948", "status": "completed"}}
 
 **Observe:**
 
 + The model converges quickly and traceplots looks pretty well mixed
 
-+++ {"papermill": {"duration": 0.07685, "end_time": "2021-02-23T11:29:28.943674", "exception": false, "start_time": "2021-02-23T11:29:28.866824", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.07685, "end_time": "2021-02-23T11:29:28.943674", "exception": false, "start_time": "2021-02-23T11:29:28.866824", "status": "completed"}}
 
 ### Transform coeffs and recover theta values
 
@@ -368,7 +356,7 @@ az.plot_trace(inf_fish);
 az.summary(np.exp(inf_fish.posterior), kind="stats")
 ```
 
-+++ {"papermill": {"duration": 0.075014, "end_time": "2021-02-23T11:29:29.324266", "exception": false, "start_time": "2021-02-23T11:29:29.249252", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.075014, "end_time": "2021-02-23T11:29:29.324266", "exception": false, "start_time": "2021-02-23T11:29:29.249252", "status": "completed"}}
 
 **Observe:**
 
@@ -416,15 +404,15 @@ az.summary(np.exp(inf_fish.posterior), kind="stats")
                              = exp(Intercept) * exp(alcohol) * exp(nomeds * alcohol:nomeds)
                              = 1 * 3 * 6 * 2 = 36
 
-+++ {"papermill": {"duration": 0.076829, "end_time": "2021-02-23T11:29:29.477240", "exception": false, "start_time": "2021-02-23T11:29:29.400411", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.076829, "end_time": "2021-02-23T11:29:29.477240", "exception": false, "start_time": "2021-02-23T11:29:29.400411", "status": "completed"}}
 
 ### 2. Alternative method, using `bambi`
 
-+++ {"papermill": {"duration": 0.074408, "end_time": "2021-02-23T11:29:29.628052", "exception": false, "start_time": "2021-02-23T11:29:29.553644", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.074408, "end_time": "2021-02-23T11:29:29.628052", "exception": false, "start_time": "2021-02-23T11:29:29.553644", "status": "completed"}}
 
 **Create Model**
 
-+++ {"papermill": {"duration": 0.07467, "end_time": "2021-02-23T11:29:29.778406", "exception": false, "start_time": "2021-02-23T11:29:29.703736", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.07467, "end_time": "2021-02-23T11:29:29.778406", "exception": false, "start_time": "2021-02-23T11:29:29.703736", "status": "completed"}}
 
 **Alternative automatic formulation using `bambi`**
 
@@ -436,12 +424,11 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:29:29.854648'
   status: completed
-tags: []
 ---
 model = bmb.Model(fml, df, family="poisson")
 ```
 
-+++ {"papermill": {"duration": 0.077285, "end_time": "2021-02-23T11:29:34.719403", "exception": false, "start_time": "2021-02-23T11:29:34.642118", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.077285, "end_time": "2021-02-23T11:29:34.719403", "exception": false, "start_time": "2021-02-23T11:29:34.642118", "status": "completed"}}
 
 **Fit Model**
 
@@ -453,12 +440,11 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:29:34.796102'
   status: completed
-tags: []
 ---
 inf_fish_alt = model.fit()
 ```
 
-+++ {"papermill": {"duration": 0.075564, "end_time": "2021-02-23T11:31:30.375433", "exception": false, "start_time": "2021-02-23T11:31:30.299869", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.075564, "end_time": "2021-02-23T11:31:30.375433", "exception": false, "start_time": "2021-02-23T11:31:30.299869", "status": "completed"}}
 
 **View Traces**
 
@@ -470,12 +456,11 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:31:30.453177'
   status: completed
-tags: []
 ---
 az.plot_trace(inf_fish_alt);
 ```
 
-+++ {"papermill": {"duration": 0.10274, "end_time": "2021-02-23T11:31:33.628707", "exception": false, "start_time": "2021-02-23T11:31:33.525967", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.10274, "end_time": "2021-02-23T11:31:33.628707", "exception": false, "start_time": "2021-02-23T11:31:33.525967", "status": "completed"}}
 
 ### Transform coeffs
 
@@ -483,7 +468,7 @@ az.plot_trace(inf_fish_alt);
 az.summary(np.exp(inf_fish_alt.posterior), kind="stats")
 ```
 
-+++ {"papermill": {"duration": 0.10059, "end_time": "2021-02-23T11:31:34.095731", "exception": false, "start_time": "2021-02-23T11:31:33.995141", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.10059, "end_time": "2021-02-23T11:31:34.095731", "exception": false, "start_time": "2021-02-23T11:31:33.995141", "status": "completed"}}
 
 **Observe:**
 
@@ -492,8 +477,6 @@ az.summary(np.exp(inf_fish_alt.posterior), kind="stats")
 + Note that the posterior predictive samples have an extreme skew
 
 ```{code-cell} ipython3
-:tags: []
-
 posterior_predictive = model.predict(inf_fish_alt, kind="pps")
 ```
 
@@ -505,7 +488,7 @@ For more information on posterior predictive checks, we can refer to {ref}`pymc:
 az.plot_ppc(inf_fish_alt);
 ```
 
-+++ {"papermill": {"duration": 0.106366, "end_time": "2021-02-23T11:31:34.956844", "exception": false, "start_time": "2021-02-23T11:31:34.850478", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.106366, "end_time": "2021-02-23T11:31:34.956844", "exception": false, "start_time": "2021-02-23T11:31:34.850478", "status": "completed"}}
 
 ## Authors
 - Example originally contributed by [Jonathan Sedar](https://github.com/jonsedar) 2016-05-15.
@@ -524,7 +507,6 @@ papermill:
   exception: false
   start_time: '2021-02-23T11:31:43.212087'
   status: completed
-tags: []
 ---
 %load_ext watermark
 %watermark -n -u -v -iv -w -p pytensor,aeppl
diff --git a/examples/generalized_linear_models/GLM-robust-with-outlier-detection.ipynb b/examples/generalized_linear_models/GLM-robust-with-outlier-detection.ipynb
index 593ec0603..f167e43f3 100644
--- a/examples/generalized_linear_models/GLM-robust-with-outlier-detection.ipynb
+++ b/examples/generalized_linear_models/GLM-robust-with-outlier-detection.ipynb
@@ -503,7 +503,6 @@
    "source": [
     "coords = {\"coefs\": [\"intercept\", \"slope\"], \"datapoint_id\": dfhoggs.index}\n",
     "with pm.Model(coords=coords) as mdl_ols:\n",
-    "\n",
     "    # Define weakly informative Normal priors to give Ridge regression\n",
     "    beta = pm.Normal(\"beta\", mu=0, sigma=10, dims=\"coefs\")\n",
     "\n",
@@ -726,7 +725,6 @@
    ],
    "source": [
     "with pm.Model(coords=coords) as mdl_studentt:\n",
-    "\n",
     "    # define weakly informative Normal priors to give Ridge regression\n",
     "    beta = pm.Normal(\"beta\", mu=0, sigma=10, dims=\"coefs\")\n",
     "\n",
@@ -1042,7 +1040,6 @@
    "outputs": [],
    "source": [
     "with pm.Model(coords=coords) as mdl_hogg:\n",
-    "\n",
     "    # state input data as Theano shared vars\n",
     "    tsv_x = pm.ConstantData(\"tsv_x\", dfhoggs[\"x\"], dims=\"datapoint_id\")\n",
     "    tsv_y = pm.ConstantData(\"tsv_y\", dfhoggs[\"y\"], dims=\"datapoint_id\")\n",
diff --git a/examples/generalized_linear_models/GLM-robust-with-outlier-detection.myst.md b/examples/generalized_linear_models/GLM-robust-with-outlier-detection.myst.md
index cbae9debf..203d1bf5f 100644
--- a/examples/generalized_linear_models/GLM-robust-with-outlier-detection.myst.md
+++ b/examples/generalized_linear_models/GLM-robust-with-outlier-detection.myst.md
@@ -262,7 +262,6 @@ where:
 ```{code-cell} ipython3
 coords = {"coefs": ["intercept", "slope"], "datapoint_id": dfhoggs.index}
 with pm.Model(coords=coords) as mdl_ols:
-
     # Define weakly informative Normal priors to give Ridge regression
     beta = pm.Normal("beta", mu=0, sigma=10, dims="coefs")
 
@@ -355,7 +354,6 @@ Note: the dataset also has `sigma_x` and `rho_xy` available, but for this exerci
 
 ```{code-cell} ipython3
 with pm.Model(coords=coords) as mdl_studentt:
-
     # define weakly informative Normal priors to give Ridge regression
     beta = pm.Normal("beta", mu=0, sigma=10, dims="coefs")
 
@@ -521,7 +519,6 @@ on `Potential` usage:
 
 ```{code-cell} ipython3
 with pm.Model(coords=coords) as mdl_hogg:
-
     # state input data as Theano shared vars
     tsv_x = pm.ConstantData("tsv_x", dfhoggs["x"], dims="datapoint_id")
     tsv_y = pm.ConstantData("tsv_y", dfhoggs["y"], dims="datapoint_id")
diff --git a/examples/generalized_linear_models/GLM-truncated-censored-regression.myst.md b/examples/generalized_linear_models/GLM-truncated-censored-regression.myst.md
index e8c1816b9..24fdb2dda 100644
--- a/examples/generalized_linear_models/GLM-truncated-censored-regression.myst.md
+++ b/examples/generalized_linear_models/GLM-truncated-censored-regression.myst.md
@@ -22,8 +22,6 @@ kernelspec:
 The notebook provides an example of how to conduct linear regression when your outcome variable is either censored or truncated.
 
 ```{code-cell} ipython3
-:tags: []
-
 from copy import copy
 
 import arviz as az
@@ -37,8 +35,6 @@ from scipy.stats import norm, truncnorm
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 %config InlineBackend.figure_format = 'retina'
 rng = default_rng(12345)
 az.style.use("arviz-darkgrid")
@@ -54,8 +50,6 @@ Truncation and censoring are examples of missing data problems. It can sometimes
 Let's further explore this with some code and plots. First we will generate some true `(x, y)` scatter data, where `y` is our outcome measure and `x` is some predictor variable.
 
 ```{code-cell} ipython3
-:tags: []
-
 slope, intercept, σ, N = 1, 0, 2, 200
 x = rng.uniform(-10, 10, N)
 y = rng.normal(loc=slope * x + intercept, scale=σ)
@@ -64,8 +58,6 @@ y = rng.normal(loc=slope * x + intercept, scale=σ)
 For this example of `(x, y)` scatter data, we can describe the truncation process as simply filtering out any data for which our outcome variable `y` falls outside of a set of bounds.
 
 ```{code-cell} ipython3
-:tags: []
-
 def truncate_y(x, y, bounds):
     keep = (y >= bounds[0]) & (y <= bounds[1])
     return (x[keep], y[keep])
@@ -74,8 +66,6 @@ def truncate_y(x, y, bounds):
 With censoring however, we are setting the `y` value equal to the bounds that they exceed.
 
 ```{code-cell} ipython3
-:tags: []
-
 def censor_y(x, y, bounds):
     cy = copy(y)
     cy[y <= bounds[0]] = bounds[0]
@@ -86,8 +76,6 @@ def censor_y(x, y, bounds):
 Based on our generated `(x, y)` data (which an experimenter would never see in real life), we can generate our actual observed datasets for truncated data `(xt, yt)` and censored data `(xc, yc)`.
 
 ```{code-cell} ipython3
-:tags: []
-
 bounds = [-5, 5]
 xt, yt = truncate_y(x, y, bounds)
 xc, yc = censor_y(x, y, bounds)
@@ -96,8 +84,6 @@ xc, yc = censor_y(x, y, bounds)
 We can visualise this latent data (in grey) and the remaining truncated or censored data (black) as below.
 
 ```{code-cell} ipython3
-:tags: []
-
 fig, axes = plt.subplots(1, 2, figsize=(10, 5))
 
 for ax in axes:
@@ -119,8 +105,6 @@ If we were to run regular linear regression on either the truncated or censored
 In this section we will run Bayesian linear regression on these datasets to see the extent of the problem. We start by defining a function which defines a PyMC model, conducts MCMC sampling, and returns the model and the MCMC sampling data.
 
 ```{code-cell} ipython3
-:tags: []
-
 def linear_regression(x, y):
     with pm.Model() as model:
         slope = pm.Normal("slope", mu=0, sigma=1)
@@ -134,8 +118,6 @@ def linear_regression(x, y):
 So we can run this on our truncated and our censored data, separately.
 
 ```{code-cell} ipython3
-:tags: []
-
 trunc_linear_model = linear_regression(xt, yt)
 
 with trunc_linear_model:
@@ -143,8 +125,6 @@ with trunc_linear_model:
 ```
 
 ```{code-cell} ipython3
-:tags: []
-
 cens_linear_model = linear_regression(xc, yc)
 
 with cens_linear_model:
@@ -154,8 +134,6 @@ with cens_linear_model:
 By plotting the posterior distribution over the slope parameters we can see that the estimates for the slope are pretty far off, so we are indeed underestimating the regression slope.
 
 ```{code-cell} ipython3
-:tags: []
-
 fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharex=True)
 
 az.plot_posterior(trunc_linear_fit, var_names=["slope"], ref_val=slope, ax=ax[0])
@@ -168,8 +146,6 @@ ax[1].set(title="Linear regression\n(censored data)", xlabel="slope");
 To appreciate the extent of the problem (for this dataset) we can visualise the posterior predictive fits alongside the data.
 
 ```{code-cell} ipython3
-:tags: []
-
 def pp_plot(x, y, fit, ax):
     # plot data
     ax.plot(x, y, "k.")
@@ -209,8 +185,6 @@ Now we have seen the problem of conducting regression on truncated or censored d
 Truncated regression models are quite simple to implement. The normal likelihood is centered on the regression slope as normal, but now we just specify a normal distribution which is truncated at the bounds.
 
 ```{code-cell} ipython3
-:tags: []
-
 def truncated_regression(x, y, bounds):
     with pm.Model() as model:
         slope = pm.Normal("slope", mu=0, sigma=1)
@@ -224,8 +198,6 @@ def truncated_regression(x, y, bounds):
 Truncated regression solves the bias problem by updating the likelihood to reflect our knowledge about the process generating the observations. Namely, we have zero chance of observing any data outside of the truncation bounds, and so the likelihood should reflect this. We can visualise this in the plot below, where compared to a normal distribution, the probability density of a truncated normal is zero outside of the truncation bounds $(y<-1)$ in this case.
 
 ```{code-cell} ipython3
-:tags: []
-
 fig, ax = plt.subplots(figsize=(10, 3))
 y = np.linspace(-4, 4, 1000)
 ax.fill_between(y, norm.pdf(y, loc=0, scale=1), 0, alpha=0.2, ec="b", fc="b", label="Normal")
@@ -246,8 +218,6 @@ ax.legend();
 ### Censored regression model
 
 ```{code-cell} ipython3
-:tags: []
-
 def censored_regression(x, y, bounds):
     with pm.Model() as model:
         slope = pm.Normal("slope", mu=0, sigma=1)
@@ -292,8 +262,6 @@ ax.set(xlabel="$y$", ylabel="probability density", ylim=(-0.02, 0.4));
 Now we can conduct our parameter estimation with the truncated regression model on the truncated data...
 
 ```{code-cell} ipython3
-:tags: []
-
 truncated_model = truncated_regression(xt, yt, bounds)
 
 with truncated_model:
@@ -303,8 +271,6 @@ with truncated_model:
 and with the censored regression model on the censored data.
 
 ```{code-cell} ipython3
-:tags: []
-
 censored_model = censored_regression(xc, yc, bounds)
 
 with censored_model:
@@ -330,8 +296,6 @@ We could speculate then, that if an experimenter had the choice of truncating or
 Correspondingly, we can confirm the models are good through visual inspection of the posterior predictive plots.
 
 ```{code-cell} ipython3
-:tags: []
-
 fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharex=True, sharey=True)
 
 pp_plot(xt, yt, truncated_fit, ax[0])
diff --git a/examples/howto/LKJ_correlation.py b/examples/howto/LKJ_correlation.py
index 7703dbe2a..7e54ff56f 100644
--- a/examples/howto/LKJ_correlation.py
+++ b/examples/howto/LKJ_correlation.py
@@ -29,7 +29,6 @@
 dataset = multivariate_normal(mu_r, cov_matrix, size=n_obs)
 
 with pm.Model() as model:
-
     mu = pm.Normal("mu", mu=0, sigma=1, shape=n_var)
 
     # Note that we access the distribution for the standard
diff --git a/examples/howto/custom_dists.py b/examples/howto/custom_dists.py
index 2e40dd233..f38766125 100644
--- a/examples/howto/custom_dists.py
+++ b/examples/howto/custom_dists.py
@@ -24,6 +24,7 @@
 ydata = np.random.normal(ydata, 10)
 data = {"x": xdata, "y": ydata}
 
+
 # define loglikelihood outside of the model context, otherwise cores wont work:
 # Lambdas defined in local namespace are not picklable (see issue #1995)
 def loglike1(value):
diff --git a/examples/howto/lasso_block_update.myst.md b/examples/howto/lasso_block_update.myst.md
index 6c918bef3..82db7bcd4 100644
--- a/examples/howto/lasso_block_update.myst.md
+++ b/examples/howto/lasso_block_update.myst.md
@@ -20,8 +20,6 @@ kernelspec:
 :::
 
 ```{code-cell} ipython3
-:tags: []
-
 %matplotlib inline
 import arviz as az
 import matplotlib.pyplot as plt
@@ -54,8 +52,6 @@ y_obs = x1 * 0.2 + x2 * 0.3 + noise
 Then define the random variables.
 
 ```{code-cell} ipython3
-:tags: []
-
 lam = 3000
 
 with pm.Model() as model:
@@ -84,8 +80,6 @@ with model:
 We conclude by plotting the sampled marginals and the joint distribution of `beta1` and `beta2`.
 
 ```{code-cell} ipython3
-:tags: []
-
 az.plot_trace(idata);
 ```
 
@@ -113,8 +107,6 @@ az.plot_pair(
 ## Watermark
 
 ```{code-cell} ipython3
-:tags: []
-
 %load_ext watermark
 %watermark -n -u -v -iv -w -p pytensor,aeppl,xarray
 ```
diff --git a/examples/howto/model_builder.myst.md b/examples/howto/model_builder.myst.md
index 3e74a27b8..dc6c4b4a3 100644
--- a/examples/howto/model_builder.myst.md
+++ b/examples/howto/model_builder.myst.md
@@ -94,8 +94,6 @@ from pymc_experimental.model_builder import ModelBuilder
 To define our desired model we inherit from the `ModelBuilder` class. There are a couple of methods we need to define.
 
 ```{code-cell} ipython3
-:tags: []
-
 class LinearModel(ModelBuilder):
     # Give the model a name
     _model_type = "LinearModel"
@@ -212,8 +210,6 @@ After fitting the model, we can probably save it to share the model as a file so
 To `save()` or `load()`, we can quickly call methods for respective tasks with the following syntax.
 
 ```{code-cell} ipython3
-:tags: []
-
 fname = "linear_model_v1.nc"
 model.save(fname)
 ```
diff --git a/examples/howto/sampling_callback.ipynb b/examples/howto/sampling_callback.ipynb
index 1e4414dae..4117d3652 100644
--- a/examples/howto/sampling_callback.ipynb
+++ b/examples/howto/sampling_callback.ipynb
@@ -31,7 +31,6 @@
     "X = np.array([1, 2, 3, 4, 5])\n",
     "y = X * 2 + np.random.randn(len(X))\n",
     "with pm.Model() as model:\n",
-    "\n",
     "    intercept = pm.Normal(\"intercept\", 0, 10)\n",
     "    slope = pm.Normal(\"slope\", 0, 10)\n",
     "\n",
diff --git a/examples/howto/sampling_callback.myst.md b/examples/howto/sampling_callback.myst.md
index adfcbb94e..bed3ff52b 100644
--- a/examples/howto/sampling_callback.myst.md
+++ b/examples/howto/sampling_callback.myst.md
@@ -31,7 +31,6 @@ import pymc3 as pm
 X = np.array([1, 2, 3, 4, 5])
 y = X * 2 + np.random.randn(len(X))
 with pm.Model() as model:
-
     intercept = pm.Normal("intercept", 0, 10)
     slope = pm.Normal("slope", 0, 10)
 
diff --git a/examples/howto/updating_priors.ipynb b/examples/howto/updating_priors.ipynb
index c0458cb31..8f9a7c99b 100644
--- a/examples/howto/updating_priors.ipynb
+++ b/examples/howto/updating_priors.ipynb
@@ -159,7 +159,6 @@
     "basic_model = Model()\n",
     "\n",
     "with basic_model:\n",
-    "\n",
     "    # Priors for unknown model parameters\n",
     "    alpha = Normal(\"alpha\", mu=0, sigma=1)\n",
     "    beta0 = Normal(\"beta0\", mu=12, sigma=1)\n",
@@ -655,7 +654,6 @@
    ],
    "source": [
     "for _ in range(10):\n",
-    "\n",
     "    # generate more data\n",
     "    X1 = np.random.randn(size)\n",
     "    X2 = np.random.randn(size) * 0.2\n",
diff --git a/examples/howto/updating_priors.myst.md b/examples/howto/updating_priors.myst.md
index d06cc3962..a94a8b464 100644
--- a/examples/howto/updating_priors.myst.md
+++ b/examples/howto/updating_priors.myst.md
@@ -72,7 +72,6 @@ Our initial beliefs about the parameters are quite informative (sigma=1) and a b
 basic_model = Model()
 
 with basic_model:
-
     # Priors for unknown model parameters
     alpha = Normal("alpha", mu=0, sigma=1)
     beta0 = Normal("beta0", mu=12, sigma=1)
@@ -118,7 +117,6 @@ traces = [trace]
 
 ```{code-cell} ipython3
 for _ in range(10):
-
     # generate more data
     X1 = np.random.randn(size)
     X2 = np.random.randn(size) * 0.2
diff --git a/examples/mixture_models/marginalized_gaussian_mixture_model.myst.md b/examples/mixture_models/marginalized_gaussian_mixture_model.myst.md
index fd47287dc..b04cb05ff 100644
--- a/examples/mixture_models/marginalized_gaussian_mixture_model.myst.md
+++ b/examples/mixture_models/marginalized_gaussian_mixture_model.myst.md
@@ -10,7 +10,7 @@ kernelspec:
   name: python3
 ---
 
-+++ {"papermill": {"duration": 0.012112, "end_time": "2020-12-20T20:45:32.375345", "exception": false, "start_time": "2020-12-20T20:45:32.363233", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.012112, "end_time": "2020-12-20T20:45:32.375345", "exception": false, "start_time": "2020-12-20T20:45:32.363233", "status": "completed"}}
 
 # Marginalized Gaussian Mixture Model
 
@@ -27,7 +27,6 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:45:32.386198'
   status: completed
-tags: []
 ---
 import arviz as az
 import numpy as np
@@ -47,7 +46,6 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:45:38.305815'
   status: completed
-tags: []
 ---
 %config InlineBackend.figure_format = 'retina'
 RANDOM_SEED = 8927
@@ -55,7 +53,7 @@ rng = np.random.default_rng(RANDOM_SEED)
 az.style.use("arviz-darkgrid")
 ```
 
-+++ {"papermill": {"duration": 0.011094, "end_time": "2020-12-20T20:45:38.362640", "exception": false, "start_time": "2020-12-20T20:45:38.351546", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.011094, "end_time": "2020-12-20T20:45:38.362640", "exception": false, "start_time": "2020-12-20T20:45:38.351546", "status": "completed"}}
 
 Gaussian mixtures are a flexible class of models for data that exhibits subpopulation heterogeneity.  A toy example of such a data set is shown below.
 
@@ -67,7 +65,6 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:45:38.373873'
   status: completed
-tags: []
 ---
 N = 1000
 
@@ -85,7 +82,6 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:45:38.403986'
   status: completed
-tags: []
 ---
 component = rng.choice(MU.size, size=N, p=W)
 x = rng.normal(MU[component], SIGMA[component], size=N)
@@ -99,14 +95,13 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:45:38.433666'
   status: completed
-tags: []
 ---
 fig, ax = plt.subplots(figsize=(8, 6))
 
 ax.hist(x, bins=30, density=True, lw=0);
 ```
 
-+++ {"papermill": {"duration": 0.012072, "end_time": "2020-12-20T20:45:38.881581", "exception": false, "start_time": "2020-12-20T20:45:38.869509", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.012072, "end_time": "2020-12-20T20:45:38.881581", "exception": false, "start_time": "2020-12-20T20:45:38.869509", "status": "completed"}}
 
 A natural parameterization of the Gaussian mixture model is as the [latent variable model](https://en.wikipedia.org/wiki/Latent_variable_model)
 
@@ -160,7 +155,6 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:45:38.894066'
   status: completed
-tags: []
 ---
 with pm.Model(coords={"cluster": np.arange(len(W)), "obs_id": np.arange(N)}) as model:
     w = pm.Dirichlet("w", np.ones_like(W))
@@ -186,7 +180,6 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:46:50.174773'
   status: completed
-tags: []
 ---
 with model:
     trace = pm.sample(5000, n_init=10000, tune=1000, return_inferencedata=True)
@@ -197,7 +190,7 @@ with model:
 trace.add_groups(posterior_predictive=ppc_trace)
 ```
 
-+++ {"papermill": {"duration": 0.013524, "end_time": "2020-12-20T20:56:38.036405", "exception": false, "start_time": "2020-12-20T20:56:38.022881", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.013524, "end_time": "2020-12-20T20:56:38.036405", "exception": false, "start_time": "2020-12-20T20:56:38.022881", "status": "completed"}}
 
 We see in the following plot that the posterior distribution on the weights and the component means has captured the true value quite well.
 
@@ -209,7 +202,7 @@ az.plot_trace(trace, var_names=["w", "mu"], compact=False);
 az.plot_posterior(trace, var_names=["w", "mu"]);
 ```
 
-+++ {"papermill": {"duration": 0.035988, "end_time": "2020-12-20T20:56:44.871074", "exception": false, "start_time": "2020-12-20T20:56:44.835086", "status": "completed"}, "tags": []}
++++ {"papermill": {"duration": 0.035988, "end_time": "2020-12-20T20:56:44.871074", "exception": false, "start_time": "2020-12-20T20:56:44.835086", "status": "completed"}}
 
 We see that the posterior predictive samples have a distribution quite close to that of the observed data.
 
@@ -231,7 +224,6 @@ papermill:
   exception: false
   start_time: '2020-12-20T20:58:54.903381'
   status: completed
-tags: []
 ---
 %load_ext watermark
 %watermark -n -u -v -iv -w -p theano,xarray
diff --git a/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.ipynb b/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.ipynb
index 3b9a2cd81..59599fcaa 100644
--- a/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.ipynb
+++ b/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.ipynb
@@ -323,7 +323,6 @@
     "    lw=3,\n",
     "    title=\"Hudson's Bay Company Data and\\nExample Model Run\",\n",
     "):\n",
-    "\n",
     "    ax.plot(time, x_y[:, 1], color=\"b\", alpha=alpha, lw=lw, label=\"Lynx (Model)\")\n",
     "    ax.plot(time, x_y[:, 0], color=\"g\", alpha=alpha, lw=lw, label=\"Hare (Model)\")\n",
     "    ax.legend(fontsize=14, loc=\"center left\", bbox_to_anchor=(1, 0.5))\n",
@@ -3611,7 +3610,6 @@
    "source": [
     "# Lotka-Volterra forward simulation model using scan\n",
     "def lv_scan_simulation_model(theta, steps_year=100, years=21):\n",
-    "\n",
     "    # variables to control time steps\n",
     "    n_steps = years * steps_year\n",
     "    dt = 1 / steps_year\n",
@@ -3724,7 +3722,6 @@
    "outputs": [],
    "source": [
     "def lv_scan_inference_model(theta, steps_year=100, years=21):\n",
-    "\n",
     "    # variables to control time steps\n",
     "    n_steps = years * steps_year\n",
     "    dt = 1 / steps_year\n",
diff --git a/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.myst.md b/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.myst.md
index 46fe16471..a6f153dff 100644
--- a/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.myst.md
+++ b/examples/ode_models/ODE_Lotka_Volterra_multiple_ways.myst.md
@@ -10,8 +10,6 @@ kernelspec:
   name: python3
 ---
 
-+++ {"tags": []}
-
 (ODE_Lotka_Volterra_fit_multiple_ways)= 
 # ODE Lotka-Volterra With Bayesian Inference in Multiple Ways
 
@@ -53,13 +51,13 @@ We will first present the Lotka-Volterra predator-prey ODE model and example dat
 ### Key Conclusions
 Based on the experiments in this notebook, the most simple and efficient method for performing Bayesian inference on the Lotka-Volterra equations was to specify the ODE system in Scipy, wrap the function as a Pytensor op, and use a Differential Evolution Metropolis (DEMetropolis) sampler in PyMC.  
 
-+++ {"tags": []}
++++
 
 ## Background
 ### Motivation
 Ordinary differential equation models (ODEs) are used in a variety of science and engineering domains to model the time evolution of physical variables.  A natural choice to estimate the values and uncertainty of model parameters given experimental data is Bayesian inference.  However, ODEs can be challenging to specify and solve in the Bayesian setting, therefore, this notebook steps through multiple methods for solving an ODE inference problem using PyMC. The Lotka-Volterra model used in this example has often been used for benchmarking Bayesian inference methods (e.g., in this Stan [case study](https://mc-stan.org/users/documentation/case-studies/lotka-volterra-predator-prey.html), and in Chapter 16 of *Statistical Rethinking* {cite:p}`mcelreath2018statistical`.
 
-+++ {"tags": []}
++++
 
 ### Lotka-Volterra Predator-Prey Model
 The Lotka-Volterra model describes the interaction between a predator and prey species. This ODE given by:
@@ -71,7 +69,7 @@ $$
 \end{aligned}
 $$
 
-+++ {"tags": []}
++++
 
 The state vector $X(t)=[x(t),y(t)]$ comprises the densities of the prey and the predator species respectively.  Parameters $\boldsymbol{\theta}=[\alpha,\beta,\gamma,\delta, x(0),y(0)]$ are the unknowns that we wish to infer from experimental observations.  $x(0), y(0)$ are the initial values of the states needed to solve the ODE, and $\alpha,\beta,\gamma$, and $\delta$ are unknown model parameters which represent the following:  
 * $\alpha$ is the growing rate of prey when there's no predator.
@@ -79,7 +77,7 @@ The state vector $X(t)=[x(t),y(t)]$ comprises the densities of the prey and the
 * $\gamma$ is the dying rate of predator when there is no prey.
 * $\delta$ is the growing rate of predator in the presence of prey.    
 
-+++ {"tags": []}
++++
 
 ### The Hudson's Bay Company data
 The Lotka-Volterra predator prey model has been used to successfully explain the dynamics of natural populations of predators and prey, such as the lynx and snowshoe hare data of the Hudson's Bay Company. Since the dataset is small, we will hand-enter the values.
@@ -120,7 +118,7 @@ plot_data(ax);
 ### Problem Statement
 The purpose of this analysis is to estimate, with uncertainty, the parameters for the Lotka-Volterra model for the Hudson's Bay Company data from 1900 to 1920.  
 
-+++ {"tags": []}
++++
 
 ## Scipy `odeint`
 
@@ -156,7 +154,6 @@ def plot_model(
     lw=3,
     title="Hudson's Bay Company Data and\nExample Model Run",
 ):
-
     ax.plot(time, x_y[:, 1], color="b", alpha=alpha, lw=lw, label="Lynx (Model)")
     ax.plot(time, x_y[:, 0], color="g", alpha=alpha, lw=lw, label="Hare (Model)")
     ax.legend(fontsize=14, loc="center left", bbox_to_anchor=(1, 0.5))
@@ -712,7 +709,6 @@ Create a function that accepts different numbers of time steps for testing.  The
 ```{code-cell} ipython3
 # Lotka-Volterra forward simulation model using scan
 def lv_scan_simulation_model(theta, steps_year=100, years=21):
-
     # variables to control time steps
     n_steps = years * steps_year
     dt = 1 / steps_year
@@ -778,7 +774,6 @@ Now that we are OK with 100 time steps per year, we write the model with indexin
 
 ```{code-cell} ipython3
 def lv_scan_inference_model(theta, steps_year=100, years=21):
-
     # variables to control time steps
     n_steps = years * steps_year
     dt = 1 / steps_year
@@ -932,14 +927,14 @@ We performed Bayesian inference on a system of ODEs in 4 main ways:
 
 The "winner" for this problem was the Scipy `odeint` solver with a differential evolution (DE) Metropolis sampler and SMC (for a model with a Likelihood) provide good results with SMC being somewhat slower (but also better diagnostics). The improved efficiency of the NUTS sampler did not make up for the inefficiency in using the slow ODE solvers with gradients.  Both DEMetropolis and SMC enable the simplest workflow for a scientist with a working numeric model and the desire to perform Bayesian inference. Just wrapping the numeric model in a Pytensor op and plugging it into a PyMC model can get you a long way!
 
-+++ {"tags": []}
++++
 
 ## Authors
 Organized and rewritten by [Greg Brunkhorst](https://github.com/gbrunkhorst)  from multiple legacy PyMC.io example notebooks by Sanmitra Ghosh, Demetri Pananos, and the PyMC Team ({ref}`ABC_introduction`).
 
 Osvaldo Martin added some clarification about SMC-ABC and  minor fixes in Mar, 2023
 
-+++ {"tags": []}
++++
 
 ## References
 
diff --git a/examples/ode_models/ODE_with_manual_gradients.ipynb b/examples/ode_models/ODE_with_manual_gradients.ipynb
index f2afcf04a..d3aa0a67f 100644
--- a/examples/ode_models/ODE_with_manual_gradients.ipynb
+++ b/examples/ode_models/ODE_with_manual_gradients.ipynb
@@ -301,7 +301,6 @@
    "source": [
     "class solveCached:\n",
     "    def __init__(self, times, n_params, n_outputs):\n",
-    "\n",
     "        self._times = times\n",
     "        self._n_params = n_params\n",
     "        self._n_outputs = n_outputs\n",
@@ -310,7 +309,6 @@
     "        self._cachedState = np.zeros((len(times), n_outputs))\n",
     "\n",
     "    def __call__(self, x):\n",
-    "\n",
     "        if np.all(x == self._cachedParam):\n",
     "            state, sens = self._cachedState, self._cachedSens\n",
     "\n",
@@ -555,7 +553,6 @@
     "\n",
     "# The probabilistic model\n",
     "with pm.Model() as LV_model:\n",
-    "\n",
     "    # Priors for unknown model parameters\n",
     "\n",
     "    alpha = pm.Normal(\"alpha\", mu=1, sigma=0.5)\n",
@@ -1111,7 +1108,6 @@
    "source": [
     "draws = 1000\n",
     "with pm.Model() as FN_model:\n",
-    "\n",
     "    a = pm.Gamma(\"a\", alpha=2, beta=1)\n",
     "    b = pm.Normal(\"b\", mu=0, sigma=1)\n",
     "    c = pm.Uniform(\"c\", lower=0.1, upper=10)\n",
diff --git a/examples/ode_models/ODE_with_manual_gradients.myst.md b/examples/ode_models/ODE_with_manual_gradients.myst.md
index 0866dd944..18f93de97 100644
--- a/examples/ode_models/ODE_with_manual_gradients.myst.md
+++ b/examples/ode_models/ODE_with_manual_gradients.myst.md
@@ -249,7 +249,6 @@ I must point out that the way I have defined the custom ODE Ops above there is t
 ```{code-cell} ipython3
 class solveCached:
     def __init__(self, times, n_params, n_outputs):
-
         self._times = times
         self._n_params = n_params
         self._n_outputs = n_outputs
@@ -258,7 +257,6 @@ class solveCached:
         self._cachedState = np.zeros((len(times), n_outputs))
 
     def __call__(self, x):
-
         if np.all(x == self._cachedParam):
             state, sens = self._cachedState, self._cachedSens
 
@@ -355,7 +353,6 @@ my_ODEop = ODEop(state, numpy_vsp)
 
 # The probabilistic model
 with pm.Model() as LV_model:
-
     # Priors for unknown model parameters
 
     alpha = pm.Normal("alpha", mu=1, sigma=0.5)
@@ -533,7 +530,6 @@ Notice how I have used the `start` argument for this example. Just like `pm.samp
 ```{code-cell} ipython3
 draws = 1000
 with pm.Model() as FN_model:
-
     a = pm.Gamma("a", alpha=2, beta=1)
     b = pm.Normal("b", mu=0, sigma=1)
     c = pm.Uniform("c", lower=0.1, upper=10)
diff --git a/examples/samplers/DEMetropolisZ_EfficiencyComparison.ipynb b/examples/samplers/DEMetropolisZ_EfficiencyComparison.ipynb
index 2a2888213..850e514f6 100644
--- a/examples/samplers/DEMetropolisZ_EfficiencyComparison.ipynb
+++ b/examples/samplers/DEMetropolisZ_EfficiencyComparison.ipynb
@@ -177,7 +177,6 @@
     "def sample_model(\n",
     "    model, D, run=0, step_class=pm.DEMetropolis, cores=1, chains=1, step_kwargs={}, sample_kwargs={}\n",
     "):\n",
-    "\n",
     "    # sampler name\n",
     "    sampler = step_class.name\n",
     "    # sample model\n",
@@ -412,7 +411,6 @@
    "outputs": [],
    "source": [
     "def plot_forest_compare_analytical(results_df):\n",
-    "\n",
     "    # extract the first 5 dimensions\n",
     "    summaries = []\n",
     "    truncated_traces = []\n",
diff --git a/examples/samplers/DEMetropolisZ_EfficiencyComparison.myst.md b/examples/samplers/DEMetropolisZ_EfficiencyComparison.myst.md
index 16eb9c1f2..8762f6249 100644
--- a/examples/samplers/DEMetropolisZ_EfficiencyComparison.myst.md
+++ b/examples/samplers/DEMetropolisZ_EfficiencyComparison.myst.md
@@ -108,7 +108,6 @@ def make_model(mu, cov):
 def sample_model(
     model, D, run=0, step_class=pm.DEMetropolis, cores=1, chains=1, step_kwargs={}, sample_kwargs={}
 ):
-
     # sampler name
     sampler = step_class.name
     # sample model
@@ -268,7 +267,6 @@ def plot_comparison_bars(results_df):
 :tags: [hide-input]
 
 def plot_forest_compare_analytical(results_df):
-
     # extract the first 5 dimensions
     summaries = []
     truncated_traces = []
diff --git a/examples/samplers/DEMetropolisZ_tune_drop_fraction.myst.md b/examples/samplers/DEMetropolisZ_tune_drop_fraction.myst.md
index f4529be1c..4eba98f78 100644
--- a/examples/samplers/DEMetropolisZ_tune_drop_fraction.myst.md
+++ b/examples/samplers/DEMetropolisZ_tune_drop_fraction.myst.md
@@ -92,7 +92,7 @@ In this notebook, a 10-dimensional multivariate normal target density will be sa
 
 We will evaluate these sampling parameters based on three sampling metrics: effective sample size, autocorrelation, and sample acceptance rates.
 
-+++ {"tags": []}
++++
 
 ## Target Distribution
 We use a multivariate normal target density with some correlation in the first few dimensions.  The function `gen_mvnormal_params` generates the parameters for the target distribution.
@@ -171,8 +171,6 @@ def sample_model(model, run=0, step_kwargs={}, sample_kwargs={}):
     return idata, t
 ```
 
-+++ {"tags": []}
-
 ### MCMC Metrics
 To evaluate the MCMC samples, we will use three different metrics.  
 * **Effective sample size (ESS)** is the sample size adjusted for autocorrelation in the trace.  The ESS will be expressed as a percentage ${(ESS)}/{(\text{Total Draws})}*100$.  A higher ESS is better.  For a Metropolis sampler, 2% ESS is fairly high. 
@@ -468,7 +466,7 @@ plot_autocorr_drop_tune_fraction(df_results)
 
 By step 100, the autocorrelation for `drop_tune_fraction` = 0.9 has already declined near zero.  The autocorrelation remains higher for the other `drop_tune_fractions` either because sample steps are too large (`drop_tune_fraction` = 0.0), or too small (`drop_tune_fraction` = 1.0).  When the entire tuning history is dropped (`drop_tune_fraction` = 1.0), the chain has to diverge from its current position back into the typical set, which takes much longer.
 
-+++ {"tags": []}
++++
 
 ### Acceptance Rate
 The rolling mean over the `'accepted'` sampler stat shows the difference in the sampler performance for various `drop_tune_fractions`.  A downward trend in the acceptance rate after tuning indicates that the proposals are starting off too narrow, and an upward trend in the acceptance rate after tuning indicates that the proposals are starting off too wide.
@@ -609,12 +607,10 @@ with model:
 print("ESS % =", calc_ess_pct(idata).mean().round(1))
 ```
 
-+++ {"tags": []}
-
 ## Authors
 Conceived by [Micheal Osthege](https://github.com/michaelosthege), expanded by [Greg Brunkhorst](https://github.com/gbrunkhorst).
 
-+++ {"tags": []}
++++
 
 ## References
 
diff --git a/examples/samplers/MLDA_gravity_surveying.ipynb b/examples/samplers/MLDA_gravity_surveying.ipynb
index f681a0d58..c542336f3 100644
--- a/examples/samplers/MLDA_gravity_surveying.ipynb
+++ b/examples/samplers/MLDA_gravity_surveying.ipynb
@@ -321,7 +321,6 @@
     "    \"\"\"\n",
     "\n",
     "    def __init__(self, f_function, depth, n_quad, n_data):\n",
-    "\n",
     "        # Set the function describing the distribution of subsurface density.\n",
     "        self.f_function = f_function\n",
     "\n",
@@ -366,7 +365,6 @@
     "        self.g = np.dot(self.K, self.f)\n",
     "\n",
     "    def plot_model(self):\n",
-    "\n",
     "        # Plot the density and the signal.\n",
     "        fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
     "        axes[0].set_title(\"Density\")\n",
@@ -388,7 +386,6 @@
     "        plt.show()\n",
     "\n",
     "    def plot_kernel(self):\n",
-    "\n",
     "        # Plot the kernel.\n",
     "        plt.figure(figsize=(8, 6))\n",
     "        plt.imshow(self.K, cmap=\"plasma\")\n",
@@ -492,7 +489,6 @@
     "    \"\"\"\n",
     "\n",
     "    def __init__(self, depth, n_quad, n_data):\n",
-    "\n",
     "        # Set the depth of the density (distance to the surface measurements).\n",
     "        self.depth = depth\n",
     "\n",
@@ -528,7 +524,6 @@
     "        self.K = self.w * self.depth / dist**3\n",
     "\n",
     "    def set_random_process(self, random_process, lamb, mkl):\n",
-    "\n",
     "        # Set the number of KL modes.\n",
     "        self.mkl = mkl\n",
     "\n",
@@ -538,7 +533,6 @@
     "        self.random_process.compute_eigenpairs()\n",
     "\n",
     "    def solve(self, parameters):\n",
-    "\n",
     "        # Internalise the Random Field parameters\n",
     "        self.parameters = parameters\n",
     "\n",
@@ -554,7 +548,6 @@
     "        self.g = np.dot(self.K, self.f)\n",
     "\n",
     "    def get_data(self):\n",
-    "\n",
     "        # Get the data vector.\n",
     "        return self.g"
    ]
@@ -872,7 +865,6 @@
     "coarse_models = []\n",
     "for j in range(len(my_models) - 1):\n",
     "    with pm.Model() as model:\n",
-    "\n",
     "        # Multivariate normal prior.\n",
     "        theta = pm.MvNormal(\"theta\", mu=mu_prior, cov=cov_prior, shape=mkl)\n",
     "\n",
@@ -1246,7 +1238,6 @@
     "runtimes = []\n",
     "\n",
     "with pm.Model() as model:\n",
-    "\n",
     "    # Multivariate normal prior.\n",
     "    theta = pm.MvNormal(\"theta\", mu=mu_prior, cov=cov_prior, shape=mkl)\n",
     "\n",
diff --git a/examples/samplers/MLDA_gravity_surveying.myst.md b/examples/samplers/MLDA_gravity_surveying.myst.md
index e0f892688..2850279cf 100644
--- a/examples/samplers/MLDA_gravity_surveying.myst.md
+++ b/examples/samplers/MLDA_gravity_surveying.myst.md
@@ -266,7 +266,6 @@ class Gravity:
     """
 
     def __init__(self, f_function, depth, n_quad, n_data):
-
         # Set the function describing the distribution of subsurface density.
         self.f_function = f_function
 
@@ -311,7 +310,6 @@ class Gravity:
         self.g = np.dot(self.K, self.f)
 
     def plot_model(self):
-
         # Plot the density and the signal.
         fig, axes = plt.subplots(1, 2, figsize=(16, 6))
         axes[0].set_title("Density")
@@ -333,7 +331,6 @@ class Gravity:
         plt.show()
 
     def plot_kernel(self):
-
         # Plot the kernel.
         plt.figure(figsize=(8, 6))
         plt.imshow(self.K, cmap="plasma")
@@ -390,7 +387,6 @@ class Gravity_Forward(Gravity):
     """
 
     def __init__(self, depth, n_quad, n_data):
-
         # Set the depth of the density (distance to the surface measurements).
         self.depth = depth
 
@@ -426,7 +422,6 @@ class Gravity_Forward(Gravity):
         self.K = self.w * self.depth / dist**3
 
     def set_random_process(self, random_process, lamb, mkl):
-
         # Set the number of KL modes.
         self.mkl = mkl
 
@@ -436,7 +431,6 @@ class Gravity_Forward(Gravity):
         self.random_process.compute_eigenpairs()
 
     def solve(self, parameters):
-
         # Internalise the Random Field parameters
         self.parameters = parameters
 
@@ -452,7 +446,6 @@ class Gravity_Forward(Gravity):
         self.g = np.dot(self.K, self.f)
 
     def get_data(self):
-
         # Get the data vector.
         return self.g
 ```
@@ -633,7 +626,6 @@ for i, m_i in enumerate(my_models):
 coarse_models = []
 for j in range(len(my_models) - 1):
     with pm.Model() as model:
-
         # Multivariate normal prior.
         theta = pm.MvNormal("theta", mu=mu_prior, cov=cov_prior, shape=mkl)
 
@@ -654,7 +646,6 @@ traces = []
 runtimes = []
 
 with pm.Model() as model:
-
     # Multivariate normal prior.
     theta = pm.MvNormal("theta", mu=mu_prior, cov=cov_prior, shape=mkl)
 
diff --git a/examples/samplers/SMC-ABC_Lotka-Volterra_example.ipynb b/examples/samplers/SMC-ABC_Lotka-Volterra_example.ipynb
index afe778668..ae8513626 100644
--- a/examples/samplers/SMC-ABC_Lotka-Volterra_example.ipynb
+++ b/examples/samplers/SMC-ABC_Lotka-Volterra_example.ipynb
@@ -403,6 +403,7 @@
     "time = 15\n",
     "t = np.linspace(0, time, size)\n",
     "\n",
+    "\n",
     "# Lotka - Volterra equation\n",
     "def dX_dt(X, t, a, b, c, d):\n",
     "    \"\"\"Return the growth rate of fox and rabbit populations.\"\"\"\n",
diff --git a/examples/samplers/SMC-ABC_Lotka-Volterra_example.myst.md b/examples/samplers/SMC-ABC_Lotka-Volterra_example.myst.md
index 6d850e97e..5a5e113f2 100644
--- a/examples/samplers/SMC-ABC_Lotka-Volterra_example.myst.md
+++ b/examples/samplers/SMC-ABC_Lotka-Volterra_example.myst.md
@@ -139,6 +139,7 @@ size = 100
 time = 15
 t = np.linspace(0, time, size)
 
+
 # Lotka - Volterra equation
 def dX_dt(X, t, a, b, c, d):
     """Return the growth rate of fox and rabbit populations."""
diff --git a/examples/survival_analysis/survival_analysis.ipynb b/examples/survival_analysis/survival_analysis.ipynb
index 825a98aa3..1c19c207e 100644
--- a/examples/survival_analysis/survival_analysis.ipynb
+++ b/examples/survival_analysis/survival_analysis.ipynb
@@ -452,7 +452,6 @@
     "coords = {\"intervals\": intervals}\n",
     "\n",
     "with pm.Model(coords=coords) as model:\n",
-    "\n",
     "    lambda0 = pm.Gamma(\"lambda0\", 0.01, 0.01, dims=\"intervals\")\n",
     "\n",
     "    beta = pm.Normal(\"beta\", 0, sigma=1000)\n",
@@ -1117,7 +1116,6 @@
     "coords = {\"intervals\": intervals}\n",
     "\n",
     "with pm.Model(coords=coords) as time_varying_model:\n",
-    "\n",
     "    lambda0 = pm.Gamma(\"lambda0\", 0.01, 0.01, dims=\"intervals\")\n",
     "    beta = GaussianRandomWalk(\"beta\", init_dist=pm.Normal.dist(), sigma=1.0, dims=\"intervals\")\n",
     "\n",
diff --git a/examples/survival_analysis/survival_analysis.myst.md b/examples/survival_analysis/survival_analysis.myst.md
index 1d39fefea..10289944c 100644
--- a/examples/survival_analysis/survival_analysis.myst.md
+++ b/examples/survival_analysis/survival_analysis.myst.md
@@ -226,7 +226,6 @@ We may approximate $d_{i, j}$ with a Poisson random variable with mean $t_{i, j}
 coords = {"intervals": intervals}
 
 with pm.Model(coords=coords) as model:
-
     lambda0 = pm.Gamma("lambda0", 0.01, 0.01, dims="intervals")
 
     beta = pm.Normal("beta", 0, sigma=1000)
@@ -345,7 +344,6 @@ We implement this model in `pymc` as follows.
 coords = {"intervals": intervals}
 
 with pm.Model(coords=coords) as time_varying_model:
-
     lambda0 = pm.Gamma("lambda0", 0.01, 0.01, dims="intervals")
     beta = GaussianRandomWalk("beta", init_dist=pm.Normal.dist(), sigma=1.0, dims="intervals")
 
diff --git a/examples/time_series/Euler-Maruyama_and_SDEs.ipynb b/examples/time_series/Euler-Maruyama_and_SDEs.ipynb
index 2854bcf00..dddb0a9e2 100644
--- a/examples/time_series/Euler-Maruyama_and_SDEs.ipynb
+++ b/examples/time_series/Euler-Maruyama_and_SDEs.ipynb
@@ -242,7 +242,6 @@
    "outputs": [],
    "source": [
     "with pm.Model() as model:\n",
-    "\n",
     "    # uniform prior, but we know it must be negative\n",
     "    lam = pm.Flat(\"lam\")\n",
     "\n",
diff --git a/examples/time_series/Euler-Maruyama_and_SDEs.myst.md b/examples/time_series/Euler-Maruyama_and_SDEs.myst.md
index 699ab8dc1..1dc11067f 100644
--- a/examples/time_series/Euler-Maruyama_and_SDEs.myst.md
+++ b/examples/time_series/Euler-Maruyama_and_SDEs.myst.md
@@ -133,7 +133,6 @@ slideshow:
   slide_type: '-'
 ---
 with pm.Model() as model:
-
     # uniform prior, but we know it must be negative
     lam = pm.Flat("lam")
 
diff --git a/examples/time_series/MvGaussianRandomWalk_demo.myst.md b/examples/time_series/MvGaussianRandomWalk_demo.myst.md
index 56bda7665..875511760 100644
--- a/examples/time_series/MvGaussianRandomWalk_demo.myst.md
+++ b/examples/time_series/MvGaussianRandomWalk_demo.myst.md
@@ -183,7 +183,7 @@ az.plot_energy(trace)
 az.plot_ppc(trace);
 ```
 
-+++ {"jupyter": {"outputs_hidden": true}, "tags": []}
++++ {"jupyter": {"outputs_hidden": true}}
 
 ## Posterior visualisation
 The graphs above look good. Now we plot the observed 3-dimensional series against the average predicted 3-dimensional series, or in other words, we plot the data against the estimated regression curve from the model {eq}`eqn:model`.
@@ -211,8 +211,6 @@ ax.set_title("Predicted Mean of Three Correlated Series");
 Finally, we plot the data against the posterior predictive samples.
 
 ```{code-cell} ipython3
-:tags: []
-
 # Rescale the posterior predictive samples
 ppc_y = y_scaler.inverse_transform(trace.posterior_predictive["y"].mean("chain"))
 
diff --git a/examples/variational_inference/GLM-hierarchical-advi-minibatch.ipynb b/examples/variational_inference/GLM-hierarchical-advi-minibatch.ipynb
index f9d7198b2..36685d386 100644
--- a/examples/variational_inference/GLM-hierarchical-advi-minibatch.ipynb
+++ b/examples/variational_inference/GLM-hierarchical-advi-minibatch.ipynb
@@ -501,7 +501,6 @@
    "outputs": [],
    "source": [
     "with hierarchical_model:\n",
-    "\n",
     "    a = pm.Normal(\"alpha\", mu=mu_a, sigma=sigma_a, dims=\"counties\")\n",
     "    # Intercept for each county, distributed around group mean mu_a\n",
     "    b = pm.Normal(\"beta\", mu=mu_b, sigma=sigma_b, dims=\"counties\")"
@@ -521,7 +520,6 @@
    "outputs": [],
    "source": [
     "with hierarchical_model:\n",
-    "\n",
     "    radon_est = a[county_idx_t] + b[county_idx_t] * floor_idx_t"
    ]
   },
@@ -539,7 +537,6 @@
    "outputs": [],
    "source": [
     "with hierarchical_model:\n",
-    "\n",
     "    # Model error\n",
     "    eps = pm.Uniform(\"eps\", lower=0, upper=100)\n",
     "\n",
@@ -779,7 +776,6 @@
    "source": [
     "# Inference button (TM)!\n",
     "with pm.Model(coords=coords):\n",
-    "\n",
     "    mu_a = pm.Normal(\"mu_alpha\", mu=0.0, sigma=100**2)\n",
     "    sigma_a = pm.Uniform(\"sigma_alpha\", lower=0, upper=100)\n",
     "    mu_b = pm.Normal(\"mu_beta\", mu=0.0, sigma=100**2)\n",
diff --git a/examples/variational_inference/GLM-hierarchical-advi-minibatch.myst.md b/examples/variational_inference/GLM-hierarchical-advi-minibatch.myst.md
index 8f0c1c916..e6ef60e9e 100644
--- a/examples/variational_inference/GLM-hierarchical-advi-minibatch.myst.md
+++ b/examples/variational_inference/GLM-hierarchical-advi-minibatch.myst.md
@@ -85,7 +85,6 @@ Intercept for each county, distributed around group mean `mu_a`. Above we just s
 
 ```{code-cell} ipython3
 with hierarchical_model:
-
     a = pm.Normal("alpha", mu=mu_a, sigma=sigma_a, dims="counties")
     # Intercept for each county, distributed around group mean mu_a
     b = pm.Normal("beta", mu=mu_b, sigma=sigma_b, dims="counties")
@@ -95,7 +94,6 @@ Model prediction of radon level `a[county_idx]` translates to `a[0, 0, 0, 1, 1,
 
 ```{code-cell} ipython3
 with hierarchical_model:
-
     radon_est = a[county_idx_t] + b[county_idx_t] * floor_idx_t
 ```
 
@@ -103,7 +101,6 @@ Finally, we specify the likelihood:
 
 ```{code-cell} ipython3
 with hierarchical_model:
-
     # Model error
     eps = pm.Uniform("eps", lower=0, upper=100)
 
@@ -153,7 +150,6 @@ start_dict = list(sample[i] for i in range(n_chains))
 ```{code-cell} ipython3
 # Inference button (TM)!
 with pm.Model(coords=coords):
-
     mu_a = pm.Normal("mu_alpha", mu=0.0, sigma=100**2)
     sigma_a = pm.Uniform("sigma_alpha", lower=0, upper=100)
     mu_b = pm.Normal("mu_beta", mu=0.0, sigma=100**2)
diff --git a/examples/variational_inference/variational_api_quickstart.ipynb b/examples/variational_inference/variational_api_quickstart.ipynb
index 44c5ae1c4..5c2fa1ce6 100644
--- a/examples/variational_inference/variational_api_quickstart.ipynb
+++ b/examples/variational_inference/variational_api_quickstart.ipynb
@@ -81,7 +81,6 @@
    "outputs": [],
    "source": [
     "with pm.Model() as gamma_model:\n",
-    "\n",
     "    alpha = pm.Exponential(\"alpha\", 0.1)\n",
     "    beta = pm.Exponential(\"beta\", 0.1)\n",
     "\n",
@@ -147,7 +146,6 @@
    ],
    "source": [
     "with gamma_model:\n",
-    "\n",
     "    # mean_field = pm.fit()\n",
     "    mean_field = pm.fit(obj_optimizer=pm.adagrad_window(learning_rate=1e-2))"
    ]
@@ -450,7 +448,6 @@
    "outputs": [],
    "source": [
     "with pm.Model() as model:\n",
-    "\n",
     "    x = pm.NormalMixture(\"x\", w=w, mu=mu, sigma=sd)\n",
     "    x2 = x**2\n",
     "    sin_x = pm.math.sin(x)"
@@ -1514,7 +1511,6 @@
     "yt = pytensor.shared(y_train)\n",
     "\n",
     "with pm.Model() as iris_model:\n",
-    "\n",
     "    # Coefficients for features\n",
     "    β = pm.Normal(\"β\", 0, sigma=1e2, shape=(4, 3))\n",
     "    # Transoform to unit interval\n",
@@ -1546,7 +1542,6 @@
    "outputs": [],
    "source": [
     "with iris_model:\n",
-    "\n",
     "    # We'll use SVGD\n",
     "    inference = pm.SVGD(n_particles=500, jitter=1)\n",
     "\n",
@@ -1920,7 +1915,6 @@
     "X = pm.Minibatch(data, batch_size=500)\n",
     "\n",
     "with pm.Model() as model:\n",
-    "\n",
     "    mu = pm.Normal(\"mu\", 0, sigma=1e5, shape=(100,))\n",
     "    sd = pm.HalfNormal(\"sd\", shape=(100,))\n",
     "    likelihood = pm.Normal(\"likelihood\", mu, sigma=sd, observed=X, total_size=data.shape)"
diff --git a/examples/variational_inference/variational_api_quickstart.myst.md b/examples/variational_inference/variational_api_quickstart.myst.md
index 8de458046..8a15e4b05 100644
--- a/examples/variational_inference/variational_api_quickstart.myst.md
+++ b/examples/variational_inference/variational_api_quickstart.myst.md
@@ -54,7 +54,6 @@ sns.histplot(gamma_data);
 
 ```{code-cell} ipython3
 with pm.Model() as gamma_model:
-
     alpha = pm.Exponential("alpha", 0.1)
     beta = pm.Exponential("beta", 0.1)
 
@@ -63,7 +62,6 @@ with pm.Model() as gamma_model:
 
 ```{code-cell} ipython3
 with gamma_model:
-
     # mean_field = pm.fit()
     mean_field = pm.fit(obj_optimizer=pm.adagrad_window(learning_rate=1e-2))
 ```
@@ -133,7 +131,6 @@ Let's use the same model:
 
 ```{code-cell} ipython3
 with pm.Model() as model:
-
     x = pm.NormalMixture("x", w=w, mu=mu, sigma=sd)
     x2 = x**2
     sin_x = pm.math.sin(x)
@@ -397,7 +394,6 @@ Xt = pytensor.shared(X_train)
 yt = pytensor.shared(y_train)
 
 with pm.Model() as iris_model:
-
     # Coefficients for features
     β = pm.Normal("β", 0, sigma=1e2, shape=(4, 3))
     # Transoform to unit interval
@@ -418,7 +414,6 @@ PyMC models have symbolic inputs for latent variables. To evaluate an expression
 
 ```{code-cell} ipython3
 with iris_model:
-
     # We'll use SVGD
     inference = pm.SVGD(n_particles=500, jitter=1)
 
@@ -543,7 +538,6 @@ Now let's use minibatches. At every iteration, we will draw 500 random values:
 X = pm.Minibatch(data, batch_size=500)
 
 with pm.Model() as model:
-
     mu = pm.Normal("mu", 0, sigma=1e5, shape=(100,))
     sd = pm.HalfNormal("sd", shape=(100,))
     likelihood = pm.Normal("likelihood", mu, sigma=sd, observed=X, total_size=data.shape)
diff --git a/sphinxext/thumbnail_extractor.py b/sphinxext/thumbnail_extractor.py
index 4cae83c01..ed6e41c14 100644
--- a/sphinxext/thumbnail_extractor.py
+++ b/sphinxext/thumbnail_extractor.py
@@ -184,7 +184,6 @@ def main(app):
     file = [HEAD]
 
     for folder, title in folder_title_map.items():
-
         nb_paths = glob(f"{folder}/*.ipynb")
         file.append(
             SECTION_TEMPLATE.format(