1 Regression and Other Stories: KidIQ

Farhan Reynaldo

Nov 11th, 2021

from pathlib import Path

import arviz as az
import bambi as bmb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

DATA_PATH = Path("../../data")

rng = np.random.default_rng(seed=12)
az.style.use("arviz-white")

kidiq = pd.read_csv(DATA_PATH / "kidiq.csv")
kidiq.head()

	kid_score	mom_hs	mom_iq	mom_work	mom_age
0	65	1	121.117529	4	27
1	98	1	89.361882	4	25
2	85	1	115.443165	4	27
3	83	1	99.449639	3	25
4	115	1	92.745710	4	27

1.1 A single predictor

1.1.1 A single binary predictor

model_1 = bmb.Model("kid_score ~ mom_hs", data=kidiq)
idata_1 = model_1.fit()
az.summary(idata_1, kind="stats")

100.00% [8000/8000 00:04<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%
Intercept	77.552	2.130	73.627	81.545
mom_hs	11.791	2.371	7.309	16.157
kid_score_sigma	19.886	0.676	18.673	21.184

1.1.2 A single continuous predictor

model_2 = bmb.Model("kid_score ~ mom_iq", data=kidiq)
idata_2 = model_2.fit()
az.summary(idata_2, kind="stats")

100.00% [8000/8000 00:06<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%
Intercept	25.646	5.921	15.310	37.783
mom_iq	0.612	0.059	0.504	0.727
kid_score_sigma	18.312	0.627	17.109	19.448

1.1.3 Displaying a regression line as a function of one input variable

stats_2 = az.summary(idata_2, kind="stats")["mean"]
x = np.arange(kidiq["mom_iq"].min(), kidiq["mom_iq"].max())
y_pred = stats_2["Intercept"] + stats_2["mom_iq"] * x

fig, ax = plt.subplots()
ax.scatter("mom_iq", "kid_score", data=kidiq, color="black", s=10)
ax.set_xlabel("Mother IQ score")
ax.set_ylabel("Child test score")
ax.plot(x, y_pred, color="black")

1.2 Two predictors

1.2.1 Linear regression

model_3 = bmb.Model("kid_score ~ mom_hs + mom_iq", data=kidiq)
idata_3 = model_3.fit()
az.summary(idata_3, kind="stats")

100.00% [8000/8000 00:13<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%
Intercept	25.678	5.946	14.861	37.316
mom_hs	5.950	2.201	1.996	10.288
mom_iq	0.564	0.061	0.449	0.676
kid_score_sigma	18.161	0.624	17.068	19.377

1.3 Graphical displays of data and fitted models

1.3.1 Two fitted regression lines – model with no interaction

stats_3 = az.summary(idata_3, kind="stats")["mean"]
x = np.arange(kidiq["mom_iq"].min(), kidiq["mom_iq"].max())
y_pred_mom_hs_1 = stats_3["Intercept"] + stats_3["mom_hs"] + stats_3["mom_iq"] * x
y_pred_mom_hs_0 = stats_3["Intercept"] + stats_3["mom_iq"] * x

fig, ax = plt.subplots()
ax.scatter("mom_iq", "kid_score", data=kidiq.query("mom_hs == 1"), color="black", s=10)
ax.scatter("mom_iq", "kid_score", data=kidiq.query("mom_hs == 0"), color="gray", s=10)
ax.plot(x, y_pred_mom_hs_1, color="black")
ax.plot(x, y_pred_mom_hs_0, color="gray")

1.3.2 Two fitted regression lines – model with interaction

model_4 = bmb.Model("kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq", data=kidiq)
idata_4 = model_4.fit()
az.summary(idata_4, kind="stats")

100.00% [8000/8000 00:32<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%
Intercept	-9.671	13.954	-35.952	16.048
mom_hs	49.183	15.520	21.813	79.798
mom_iq	0.949	0.151	0.665	1.227
mom_hs:mom_iq	-0.462	0.164	-0.764	-0.154
kid_score_sigma	18.000	0.617	16.815	19.118

stats_4 = az.summary(idata_4, kind="stats")["mean"]
x = np.arange(kidiq["mom_iq"].min(), kidiq["mom_iq"].max())
y_pred_mom_hs_1 = (
    stats_4["Intercept"]
    + stats_4["mom_hs"]
    + (stats_4["mom_hs:mom_iq"] + stats_4["mom_iq"]) * x
)
y_pred_mom_hs_0 = stats_4["Intercept"] + stats_4["mom_iq"] * x

fig, ax = plt.subplots()
ax.scatter("mom_iq", "kid_score", data=kidiq.query("mom_hs == 1"), color="black", s=10)
ax.scatter("mom_iq", "kid_score", data=kidiq.query("mom_hs == 0"), color="gray", s=10)
ax.plot(x, y_pred_mom_hs_1, color="black")
ax.plot(x, y_pred_mom_hs_0, color="gray");

1.4 Displaying uncertainty in the fitted regression

1.4.1 A single continuous predictor

index = rng.choice(len(idata_2.posterior["draw"].values), 10)
intercept_sample = idata_2.posterior["Intercept"][0, index].values
slope_sample = idata_2.posterior["mom_iq"][0, index].values
x = np.arange(kidiq["mom_iq"].min(), kidiq["mom_iq"].max())
y_pred = (
    idata_2.posterior["Intercept"].values.mean()
    + idata_2.posterior["mom_iq"].values.mean() * x
)

fig, ax = plt.subplots()
ax.scatter("mom_iq", "kid_score", data=kidiq, color="black", s=10)
for intercept, slope in zip(intercept_sample, slope_sample):
    ax.plot(x, intercept + slope * x, color="gray", alpha=0.5)
ax.plot(x, y_pred, color="black")

ax.set_xlabel("Mother IQ score")
ax.set_ylabel("Child test score");

1.4.2 Two predictors

# TODO: add plot code here

1.4.3 Center predictors to have zero mean

kidiq["c_mom_hs"] = kidiq["mom_hs"] - kidiq["mom_hs"].mean()
kidiq["c_mom_iq"] = kidiq["mom_iq"] - kidiq["mom_iq"].mean()
model_4c = bmb.Model(
    "kid_score ~ c_mom_hs + c_mom_iq + c_mom_hs:c_mom_iq",
    data=kidiq,
)
idata_4c = model_4c.fit()
az.summary(idata_4c)

100.00% [8000/8000 00:07<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
Intercept	87.640	0.912	85.894	89.346	0.014	0.010	4003.0	3038.0	1.0
c_mom_hs	2.755	2.368	-1.498	7.267	0.040	0.029	3415.0	2573.0	1.0
c_mom_iq	0.590	0.061	0.479	0.705	0.001	0.001	4810.0	3095.0	1.0
c_mom_hs:c_mom_iq	-0.485	0.159	-0.772	-0.175	0.002	0.002	4816.0	3545.0	1.0
kid_score_sigma	18.002	0.605	16.930	19.136	0.008	0.006	5254.0	3572.0	1.0

1.4.4 Center predictors based on a reference point

kidiq["c2_mom_hs"] = kidiq["mom_hs"] - 0.5
kidiq["c2_mom_iq"] = kidiq["mom_iq"] - 100
model_4c2 = bmb.Model(
    "kid_score ~ c2_mom_hs + c2_mom_iq + c2_mom_hs:c2_mom_iq",
    data=kidiq,
)
idata_4c2 = model_4c2.fit()
az.summary(idata_4c2)

100.00% [8000/8000 00:07<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
Intercept	86.818	1.210	84.457	88.981	0.025	0.017	2387.0	2341.0	1.0
c2_mom_hs	2.883	2.386	-1.652	7.248	0.049	0.035	2381.0	2572.0	1.0
c2_mom_iq	0.726	0.082	0.582	0.888	0.002	0.001	2074.0	2479.0	1.0
c2_mom_hs:c2_mom_iq	-0.482	0.163	-0.799	-0.188	0.003	0.002	2172.0	2444.0	1.0
kid_score_sigma	18.000	0.619	16.896	19.269	0.010	0.007	3772.0	3164.0	1.0

1.4.5 Center and scale predictors to have zero mean and sd=1/2

kidiq["z_mom_hs"] = (kidiq["mom_hs"] - kidiq["mom_hs"].mean()) / (
    2 * kidiq["mom_hs"].std()
)
kidiq["z_mom_iq"] = (kidiq["mom_iq"] - kidiq["mom_iq"].mean()) / (
    2 * kidiq["mom_iq"].std()
)
model_4c = bmb.Model(
    "kid_score ~ z_mom_hs + z_mom_iq + z_mom_hs:z_mom_iq",
    data=kidiq,
)
idata_4c = model_4c.fit()
az.summary(idata_4c)

100.00% [8000/8000 00:07<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
Intercept	87.648	0.937	85.868	89.388	0.012	0.009	6013.0	3314.0	1.0
z_mom_hs	2.299	2.018	-1.590	6.002	0.034	0.025	3598.0	3456.0	1.0
z_mom_iq	17.676	1.846	14.433	21.301	0.027	0.019	4519.0	3154.0	1.0
z_mom_hs:z_mom_iq	-12.069	3.996	-19.596	-4.583	0.064	0.047	3939.0	3062.0	1.0
kid_score_sigma	18.002	0.616	16.880	19.145	0.009	0.006	5097.0	3149.0	1.0

1.4.6 Predict using working status of mother

model_5 = bmb.Model("kid_score ~ C(mom_work)", data=kidiq)
idata_5 = model_5.fit()
az.summary(idata_5, kind="stats")

100.00% [8000/8000 00:04<00:00 Sampling 4 chains, 0 divergences]

	mean	sd	hdi_3%	hdi_97%
Intercept	81.994	2.292	77.896	86.441
C(mom_work)[2]	3.866	3.090	-1.804	9.844
C(mom_work)[3]	11.504	3.561	4.844	18.084
C(mom_work)[4]	5.213	2.672	-0.025	9.851
kid_score_sigma	20.289	0.696	19.009	21.646