diff --git a/examples/case_studies/baseball.py b/examples/case_studies/baseball.py
deleted file mode 100644
index f861099a7..000000000
--- a/examples/case_studies/baseball.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#
-# Demonstrates the usage of hierarchical partial pooling
-# See http://mc-stan.org/documentation/case-studies/pool-binary-trials.html for more details
-#
-
-import numpy as np
-
-import pymc3 as pm
-
-
-def build_model():
-    data = np.loadtxt(
-        pm.get_data("efron-morris-75-data.tsv"), delimiter="\t", skiprows=1, usecols=(2, 3)
-    )
-
-    atbats = pm.floatX(data[:, 0])
-    hits = pm.floatX(data[:, 1])
-
-    N = len(hits)
-
-    # we want to bound kappa from below
-    BoundedKappa = pm.Bound(pm.Pareto, lower=1.0)
-
-    with pm.Model() as model:
-        phi = pm.Uniform("phi", lower=0.0, upper=1.0)
-        kappa = BoundedKappa("kappa", alpha=1.0001, m=1.5)
-        thetas = pm.Beta("thetas", alpha=phi * kappa, beta=(1.0 - phi) * kappa, shape=N)
-        ys = pm.Binomial("ys", n=atbats, p=thetas, observed=hits)
-    return model
-
-
-def run(n=2000):
-    model = build_model()
-    with model:
-        trace = pm.sample(n, target_accept=0.99)
-
-    pm.traceplot(trace)
-
-
-if __name__ == "__main__":
-    run()
diff --git a/examples/case_studies/disaster_model.py b/examples/case_studies/disaster_model.py
deleted file mode 100644
index 432c5efc3..000000000
--- a/examples/case_studies/disaster_model.py
+++ /dev/null
@@ -1,154 +0,0 @@
-"""
-A model for the disasters data with a changepoint
-
-switchpoint ~ U(1851, 1962)
-early_mean ~ Exp(1.)
-late_mean ~ Exp(1.)
-disasters[t] ~ Poisson(early_mean if t <= switchpoint else late_mean)
-
-"""
-
-import arviz as az
-import theano.tensor as tt
-
-from numpy import arange, array
-
-import pymc3 as pm
-
-__all__ = ["disasters_data", "switchpoint", "early_mean", "late_mean", "rate", "disasters"]
-
-
-# Time series of recorded coal mining disasters in the UK from 1851 to 1962
-disasters_data = array(
-    [
-        4,
-        5,
-        4,
-        0,
-        1,
-        4,
-        3,
-        4,
-        0,
-        6,
-        3,
-        3,
-        4,
-        0,
-        2,
-        6,
-        3,
-        3,
-        5,
-        4,
-        5,
-        3,
-        1,
-        4,
-        4,
-        1,
-        5,
-        5,
-        3,
-        4,
-        2,
-        5,
-        2,
-        2,
-        3,
-        4,
-        2,
-        1,
-        3,
-        2,
-        2,
-        1,
-        1,
-        1,
-        1,
-        3,
-        0,
-        0,
-        1,
-        0,
-        1,
-        1,
-        0,
-        0,
-        3,
-        1,
-        0,
-        3,
-        2,
-        2,
-        0,
-        1,
-        1,
-        1,
-        0,
-        1,
-        0,
-        1,
-        0,
-        0,
-        0,
-        2,
-        1,
-        0,
-        0,
-        0,
-        1,
-        1,
-        0,
-        2,
-        3,
-        3,
-        1,
-        1,
-        2,
-        1,
-        1,
-        1,
-        1,
-        2,
-        4,
-        2,
-        0,
-        0,
-        1,
-        4,
-        0,
-        0,
-        0,
-        1,
-        0,
-        0,
-        0,
-        0,
-        0,
-        1,
-        0,
-        0,
-        1,
-        0,
-        1,
-    ]
-)
-year = arange(1851, 1962)
-
-with pm.Model() as model:
-    switchpoint = pm.DiscreteUniform("switchpoint", lower=year.min(), upper=year.max())
-    early_mean = pm.Exponential("early_mean", lam=1.0)
-    late_mean = pm.Exponential("late_mean", lam=1.0)
-
-    # Allocate appropriate Poisson rates to years before and after current
-    # switchpoint location
-    rate = tt.switch(switchpoint >= year, early_mean, late_mean)
-
-    disasters = pm.Poisson("disasters", rate, observed=disasters_data)
-
-    # Initial values for stochastic nodes
-    start = {"early_mean": 2.0, "late_mean": 3.0}
-
-    tr = pm.sample(1000, tune=500, start=start, cores=1)
-    az.plot_trace(tr)
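
The script above is written against the PyMC3/Theano API (tt.switch, the start= keyword, pm.traceplot). For comparison, a minimal sketch of the same changepoint model — assuming the current PyMC >= 5 API, where pm.math.switch replaces tt.switch and initvals replaces start=; the data series is truncated here for brevity:

import arviz as az
import numpy as np
import pymc as pm

# First ten values of the disaster series above; the full 111-value array goes here
disasters_data = np.array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6])
year = np.arange(1851, 1851 + len(disasters_data))

with pm.Model() as model:
    switchpoint = pm.DiscreteUniform("switchpoint", lower=year.min(), upper=year.max())
    early_mean = pm.Exponential("early_mean", lam=1.0)
    late_mean = pm.Exponential("late_mean", lam=1.0)

    # Years at or before the switchpoint get early_mean, later years get late_mean
    rate = pm.math.switch(switchpoint >= year, early_mean, late_mean)
    disasters = pm.Poisson("disasters", rate, observed=disasters_data)

    # initvals replaces the old start= keyword
    idata = pm.sample(1000, tune=500, initvals={"early_mean": 2.0, "late_mean": 3.0}, cores=1)

az.plot_trace(idata)
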
-""" - -import arviz as az -import theano.tensor as tt - -from numpy import arange, array, empty -from theano.compile.ops import as_op - -import pymc3 as pm - -__all__ = ["disasters_data", "switchpoint", "early_mean", "late_mean", "rate", "disasters"] - -# Time series of recorded coal mining disasters in the UK from 1851 to 1962 -disasters_data = array( - [ - 4, - 5, - 4, - 0, - 1, - 4, - 3, - 4, - 0, - 6, - 3, - 3, - 4, - 0, - 2, - 6, - 3, - 3, - 5, - 4, - 5, - 3, - 1, - 4, - 4, - 1, - 5, - 5, - 3, - 4, - 2, - 5, - 2, - 2, - 3, - 4, - 2, - 1, - 3, - 2, - 2, - 1, - 1, - 1, - 1, - 3, - 0, - 0, - 1, - 0, - 1, - 1, - 0, - 0, - 3, - 1, - 0, - 3, - 2, - 2, - 0, - 1, - 1, - 1, - 0, - 1, - 0, - 1, - 0, - 0, - 0, - 2, - 1, - 0, - 0, - 0, - 1, - 1, - 0, - 2, - 3, - 3, - 1, - 1, - 2, - 1, - 1, - 1, - 1, - 2, - 4, - 2, - 0, - 0, - 1, - 4, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 1, - 0, - 1, - ] -) -years = len(disasters_data) - - -@as_op(itypes=[tt.lscalar, tt.dscalar, tt.dscalar], otypes=[tt.dvector]) -def rate_(switchpoint, early_mean, late_mean): - out = empty(years) - out[:switchpoint] = early_mean - out[switchpoint:] = late_mean - return out - - -with pm.Model() as model: - # Prior for distribution of switchpoint location - switchpoint = pm.DiscreteUniform("switchpoint", lower=0, upper=years) - # Priors for pre- and post-switch mean number of disasters - early_mean = pm.Exponential("early_mean", lam=1.0) - late_mean = pm.Exponential("late_mean", lam=1.0) - - # Allocate appropriate Poisson rates to years before and after current - # switchpoint location - idx = arange(years) - rate = rate_(switchpoint, early_mean, late_mean) - - # Data likelihood - disasters = pm.Poisson("disasters", rate, observed=disasters_data) - - # Use slice sampler for means - step1 = pm.Slice([early_mean, late_mean]) - # Use Metropolis for switchpoint, since it accommodates discrete variables - step2 = pm.Metropolis([switchpoint]) - - # Initial values for stochastic nodes - start = {"early_mean": 2.0, "late_mean": 3.0} - - tr = pm.sample(1000, tune=500, start=start, step=[step1, step2], cores=1) - az.plot_trace(tr) diff --git a/examples/case_studies/gelman_bioassay.py b/examples/case_studies/gelman_bioassay.py deleted file mode 100644 index 3bead2e26..000000000 --- a/examples/case_studies/gelman_bioassay.py +++ /dev/null @@ -1,30 +0,0 @@ -from numpy import array, ones - -import pymc3 as pm - -# Samples for each dose level -n = 5 * ones(4, dtype=int) -# Log-dose -dose = array([-0.86, -0.3, -0.05, 0.73]) - -with pm.Model() as model: - # Logit-linear model parameters - alpha = pm.Normal("alpha", 0, sigma=100.0) - beta = pm.Normal("beta", 0, sigma=1.0) - - # Calculate probabilities of death - theta = pm.Deterministic("theta", pm.math.invlogit(alpha + beta * dose)) - - # Data likelihood - deaths = pm.Binomial("deaths", n=n, p=theta, observed=[0, 1, 3, 5]) - - -def run(n=1000): - if n == "short": - n = 50 - with model: - pm.sample(n, tune=1000) - - -if __name__ == "__main__": - run() diff --git a/examples/case_studies/gelman_schools.py b/examples/case_studies/gelman_schools.py deleted file mode 100644 index 4066e9320..000000000 --- a/examples/case_studies/gelman_schools.py +++ /dev/null @@ -1,52 +0,0 @@ -import numpy as np - -from arviz import loo -from pymc3 import HalfCauchy, Model, Normal, sample - -"""Original Stan model - -data { - int J; // number of schools - real y[J]; // estimated treatment effects - real sigma[J]; // s.e. 
diff --git a/examples/case_studies/gelman_schools.py b/examples/case_studies/gelman_schools.py
deleted file mode 100644
index 4066e9320..000000000
--- a/examples/case_studies/gelman_schools.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import numpy as np
-
-from arviz import loo
-from pymc3 import HalfCauchy, Model, Normal, sample
-
-"""Original Stan model
-
-data {
-    int<lower=0> J; // number of schools
-    real y[J]; // estimated treatment effects
-    real<lower=0> sigma[J]; // s.e. of effect estimates
-}
-parameters {
-    real mu;
-    real<lower=0> tau;
-    real eta[J];
-}
-transformed parameters {
-    real theta[J];
-    for (j in 1:J)
-        theta[j] <- mu + tau * eta[j];
-}
-model {
-    eta ~ normal(0, 1);
-    y ~ normal(theta, sigma);
-}
-"""
-
-J = 8
-y = np.array([28, 8, -3, 7, -1, 1, 18, 12])
-sigma = np.array([15, 10, 16, 11, 9, 11, 10, 18])
-
-with Model() as schools:
-    eta = Normal("eta", 0, 1, shape=J)
-    mu = Normal("mu", 0, sigma=1e6)
-    tau = HalfCauchy("tau", 25)
-
-    theta = mu + tau * eta
-
-    obs = Normal("obs", theta, sigma=sigma, observed=y)
-
-
-def run(n=1000):
-    if n == "short":
-        n = 50
-    with schools:
-        tr = sample(n)
-        loo(tr)
-
-
-if __name__ == "__main__":
-    run()
diff --git a/examples/case_studies/lasso_missing.py b/examples/case_studies/lasso_missing.py
deleted file mode 100644
index 1757811ed..000000000
--- a/examples/case_studies/lasso_missing.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import pandas as pd
-
-from numpy.ma import masked_values
-
-import pymc3 as pm
-
-# Import data, filling missing values with sentinels (-999)
-test_scores = pd.read_csv(pm.get_data("test_scores.csv")).fillna(-999)
-
-# Extract variables: test score, gender, number of siblings, previous disability,
-# age, mother with HS education or better, and hearing loss identified by
-# 3 months of age
-(score, male, siblings, disability, age, mother_hs, early_ident) = (
-    test_scores[["score", "male", "siblings", "prev_disab", "age_test", "mother_hs", "early_ident"]]
-    .astype(float)
-    .values.T
-)
-
-with pm.Model() as model:
-    # Impute missing values
-    sib_mean = pm.Exponential("sib_mean", 1.0)
-    siblings_imp = pm.Poisson("siblings_imp", sib_mean, observed=masked_values(siblings, value=-999))
-
-    p_disab = pm.Beta("p_disab", 1.0, 1.0)
-    disability_imp = pm.Bernoulli(
-        "disability_imp", p_disab, observed=masked_values(disability, value=-999)
-    )
-
-    p_mother = pm.Beta("p_mother", 1.0, 1.0)
-    mother_imp = pm.Bernoulli("mother_imp", p_mother, observed=masked_values(mother_hs, value=-999))
-
-    s = pm.HalfCauchy("s", 5.0, testval=5)
-    beta = pm.Laplace("beta", 0.0, 100.0, shape=7, testval=0.1)
-
-    expected_score = (
-        beta[0]
-        + beta[1] * male
-        + beta[2] * siblings_imp
-        + beta[3] * disability_imp
-        + beta[4] * age
-        + beta[5] * mother_imp
-        + beta[6] * early_ident
-    )
-
-    observed_score = pm.Normal("observed_score", expected_score, s, observed=score)
-
-
-with model:
-    start = pm.find_MAP()
-    step1 = pm.NUTS([beta, s, p_disab, p_mother, sib_mean], scaling=start)
-    step2 = pm.BinaryGibbsMetropolis([mother_imp.missing_values, disability_imp.missing_values])
-
-
-def run(n=5000):
-    if n == "short":
-        n = 100
-    with model:
-        pm.sample(n, step=[step1, step2], start=start)
-
-
-if __name__ == "__main__":
-    run()
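
Two details in the model above are easy to miss: the Laplace prior on beta is what makes this a "lasso" regression (its sharp peak at zero shrinks coefficients, analogous to an L1 penalty), and passing a masked array as observed is what triggers imputation — PyMC3 treats each masked entry as a latent variable and samples it alongside the parameters. A minimal self-contained sketch of that imputation mechanism, using hypothetical toy data:

import numpy as np

from numpy.ma import masked_values

import pymc3 as pm

# Hypothetical toy counts; -999 marks two missing entries
counts = masked_values(np.array([3, 1, -999, 2, -999, 4]), value=-999)

with pm.Model() as toy:
    mu = pm.Exponential("mu", 1.0)
    # PyMC3 turns the masked entries into a latent variable named "obs_missing"
    obs = pm.Poisson("obs", mu, observed=counts)
    trace = pm.sample(500, cores=1)

# Posterior draws for the two imputed entries
print(trace["obs_missing"].mean(axis=0))
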
diff --git a/examples/case_studies/lightspeed_example.py b/examples/case_studies/lightspeed_example.py
deleted file mode 100644
index 061a28617..000000000
--- a/examples/case_studies/lightspeed_example.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import numpy as np
-import arviz as az
-import pymc3 as pm
-
-light_speed = np.array(
-    [
-        28,
-        26,
-        33,
-        24,
-        34,
-        -44,
-        27,
-        16,
-        40,
-        -2,
-        29,
-        22,
-        24,
-        21,
-        25,
-        30,
-        23,
-        29,
-        31,
-        19,
-        24,
-        20,
-        36,
-        32,
-        36,
-        28,
-        25,
-        21,
-        28,
-        29,
-        37,
-        25,
-        28,
-        26,
-        30,
-        32,
-        36,
-        26,
-        30,
-        22,
-        36,
-        23,
-        27,
-        27,
-        28,
-        27,
-        31,
-        27,
-        26,
-        33,
-        26,
-        32,
-        32,
-        24,
-        39,
-        28,
-        24,
-        25,
-        32,
-        25,
-        29,
-        27,
-        28,
-        29,
-        16,
-        23,
-    ]
-)
-
-model_1 = pm.Model()
-
-with model_1:
-    # priors as specified in the Stan model
-    # mu = pm.Uniform('mu', lower=-np.inf, upper=np.inf)
-    # sigma = pm.Uniform('sigma', lower=0, upper=np.inf)
-
-    # using vague (but proper) priors works
-    mu = pm.Uniform("mu", lower=light_speed.std() / 1000.0, upper=light_speed.std() * 1000.0)
-    sigma = pm.Uniform("sigma", lower=light_speed.std() / 1000.0, upper=light_speed.std() * 1000.0)
-
-    # define likelihood
-    y_obs = pm.Normal("Y_obs", mu=mu, sigma=sigma, observed=light_speed)
-
-
-def run(n=5000):
-    with model_1:
-        trace = pm.sample(n)
-
-    az.summary(trace)
-
-
-if __name__ == "__main__":
-    run()
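
The commented-out priors in the script above are improper (uniform over an infinite interval), which pm.Uniform cannot represent; the bounded "vague" priors are a workaround. PyMC3 does ship improper flat priors, so a closer transcription of the Stan model is possible — a sketch, with the caveat that improper priors can make sampling and model comparison fragile, and with the measurement array truncated for brevity:

import numpy as np

import pymc3 as pm

# First ten of the 66 light-speed measurements above; the full array goes here
light_speed = np.array([28, 26, 33, 24, 34, -44, 27, 16, 40, -2])

with pm.Model() as model_flat:
    mu = pm.Flat("mu")  # improper uniform over (-inf, inf)
    sigma = pm.HalfFlat("sigma")  # improper uniform over (0, inf)
    y_obs = pm.Normal("Y_obs", mu=mu, sigma=sigma, observed=light_speed)
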