@@ -13,7 +13,7 @@ kernelspec:
1313(GLM-out-of-sample-predictions)=
1414# Out-Of-Sample Predictions
1515
16- :::{post} December, 2022
16+ :::{post} December, 2023
1717:tags: generalized linear model, logistic regression, out of sample predictions, patsy
1818:category: beginner
1919:::
@@ -23,13 +23,11 @@ import arviz as az
2323import matplotlib.pyplot as plt
2424import numpy as np
2525import pandas as pd
26- import patsy
2726import pymc as pm
2827import seaborn as sns
2928
3029from scipy.special import expit as inverse_logit
31- from sklearn.metrics import RocCurveDisplay, accuracy_score, auc, roc_curve
32- from sklearn.model_selection import train_test_split
30+ from sklearn.metrics import RocCurveDisplay, auc, roc_curve
3331```
3432
3533``` {code-cell} ipython3
@@ -55,7 +53,7 @@ beta_x2 = -1
5553beta_interaction = 2
5654z = intercept + beta_x1 * x1 + beta_x2 * x2 + beta_interaction * x1 * x2
5755p = inverse_logit(z)
58- # note binimial with n=1 is equal to a bernoulli
56+ # note binomial with n=1 is equal to a Bernoulli
5957y = rng.binomial(n=1, p=p, size=n)
6058df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))
6159df.head()
@@ -81,26 +79,33 @@ ax.set(title="Sample Data", xlim=(-9, 9), ylim=(-9, 9));
8179## Prepare Data for Modeling
8280
8381``` {code-cell} ipython3
84- y, x = patsy.dmatrices("y ~ x1 * x2", data=df)
85- y = np.asarray(y).flatten()
86- labels = x.design_info.column_names
87- x = np.asarray(x)
82+ labels = ["Intercept", "x1", "x2", "x1:x2"]
83+ df["Intercept"] = np.ones(len(df))
84+ df["x1:x2"] = df["x1"] * df["x2"]
85+ # reorder columns to be in the same order as labels
86+ df = df[labels]
87+ x = df.to_numpy()
8888```
8989
9090Now we do a train-test split.
9191
9292``` {code-cell} ipython3
93- x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)
93+ indices = rng.permutation(x.shape[0])
94+ train_prop = 0.7
95+ train_size = int(train_prop * x.shape[0])
96+ training_idx, test_idx = indices[:train_size], indices[train_size:]
97+ x_train, x_test = x[training_idx, :], x[test_idx, :]
98+ y_train, y_test = y[training_idx], y[test_idx]
9499```
95100
96101## Define and Fit the Model
97102
98103We now specify the model in PyMC.
99104
100105``` {code-cell} ipython3
101- COORDS = {"coeffs": labels}
106+ coords = {"coeffs": labels}
102107
103- with pm.Model(coords=COORDS ) as model:
108+ with pm.Model(coords=coords ) as model:
104109 # data containers
105110 X = pm.MutableData("X", x_train)
106111 y = pm.MutableData("y", y_train)
@@ -152,15 +157,15 @@ with model:
152157``` {code-cell} ipython3
153158# Compute the point prediction by taking the mean and defining the category via a threshold.
154159p_test_pred = idata.posterior_predictive["obs"].mean(dim=["chain", "draw"])
155- y_test_pred = (p_test_pred >= 0.5).astype("int")
160+ y_test_pred = (p_test_pred >= 0.5).astype("int").to_numpy()
156161```
157162
158163## Evaluate Model
159164
160165First let us compute the accuracy on the test set.
161166
162167``` {code-cell} ipython3
163- print(f"accuracy = {accuracy_score(y_true= y_test, y_pred =y_test_pred): 0.3f}")
168+ print(f"accuracy = {np.mean( y_test= =y_test_pred): 0.3f}")
164169```
165170
166171Next, we plot the [ roc curve] ( https://en.wikipedia.org/wiki/Receiver_operating_characteristic ) and compute the [ auc] ( https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve ) .
@@ -218,8 +223,13 @@ x1_grid, x2_grid, x_grid = make_grid()
218223
219224with model:
220225 # Create features on the grid.
221- x_grid_ext = patsy.dmatrix(formula_like="x1 * x2", data=dict(x1=x_grid[:, 0], x2=x_grid[:, 1]))
222- x_grid_ext = np.asarray(x_grid_ext)
226+ x_grid_ext = np.hstack(
227+ (
228+ np.ones((x_grid.shape[0], 1)),
229+ x_grid,
230+ (x_grid[:, 0] * x_grid[:, 1]).reshape(-1, 1),
231+ )
232+ )
223233 # set the observed variables
224234 pm.set_data({"X": x_grid_ext})
225235 # calculate pushforward values of `p`
@@ -278,12 +288,17 @@ Note that we have computed the model decision boundary by using the mean of the
278288- [ Bayesian Analysis with Python (Second edition) - Chapter 4] ( https://github.com/aloctavodia/BAP/blob/master/code/Chp4/04_Generalizing_linear_models.ipynb )
279289- [ Statistical Rethinking] ( https://xcelab.net/rm/statistical-rethinking/ )
280290
291+ +++
292+
293+
294+
281295+++
282296
283297## Authors
284298- Created by [ Juan Orduz] ( https://github.com/juanitorduz ) .
285299- Updated by [ Benjamin T. Vincent] ( https://github.com/drbenvincent ) to PyMC v4 in June 2022
286300- Re-executed by [ Benjamin T. Vincent] ( https://github.com/drbenvincent ) with PyMC v5 in December 2022
301+ - Updated by [ Christian Luhmann] ( https://github.com/cluhmann ) in December 2023 ([ pymc-examples #616 ] ( https://github.com/pymc-devs/pymc-examples/pull/616 ) )
287302
288303+++
289304
0 commit comments