2  Regression and Other Stories: Height and weight

Farhan Reynaldo

Nov 11th, 2021

from pathlib import Path

import arviz as az
import bambi as bmb
import matplotlib.pyplot as plt
import pandas as pd

DATA_PATH = Path("../../data")
az.style.use("arviz-white")
earnings = pd.read_csv(DATA_PATH / "earnings.csv")
earnings.head()
height weight male earn earnk ethnicity education mother_education father_education walk exercise smokenow tense angry age
0 74 210.0 1 50000.0 50.0 White 16.0 16.0 16.0 3 3 2.0 0.0 0.0 45
1 66 125.0 0 60000.0 60.0 White 16.0 16.0 16.0 6 5 1.0 0.0 0.0 58
2 64 126.0 0 30000.0 30.0 White 16.0 16.0 16.0 8 1 2.0 1.0 1.0 29
3 65 200.0 0 25000.0 25.0 White 17.0 17.0 NaN 8 1 2.0 0.0 0.0 57
4 63 110.0 0 50000.0 50.0 Other 16.0 16.0 16.0 5 6 2.0 0.0 0.0 91

2.1 Simulating uncertainty for linear predictors and predicted values

2.1.1 Predict weight (in pounds) from height (in inches)

model_1 = bmb.Model("weight ~ height", data=earnings, dropna=True)
idata_1 = model_1.fit()
az.summary(idata_1, kind="stats")
100.00% [8000/8000 00:03<00:00 Sampling 4 chains, 0 divergences]
mean sd hdi_3% hdi_97%
Intercept -172.922 11.899 -194.217 -149.535
height 4.944 0.179 4.597 5.263
weight_sigma 28.980 0.489 28.062 29.889

Predict weight for 66 inches person

stats_1 = az.summary(idata_1, kind="stats")["mean"]
predicted_1 = stats_1["Intercept"] + stats_1["height"] * 66
round(predicted_1, 1)
153.4

or

new = pd.DataFrame(dict(height=[66]))
model_1.predict(idata_1, data=new)
predicted_1 = idata_1.posterior["weight_mean"].values.mean()
round(predicted_1, 1)
153.4

2.1.2 Center heights

earnings["c_height"] = earnings["height"] - 66
model_2 = bmb.Model("weight ~ c_height", data=earnings, dropna=True)
idata_2 = model_2.fit()
az.summary(idata_2, kind="stats")
100.00% [8000/8000 00:04<00:00 Sampling 4 chains, 0 divergences]
mean sd hdi_3% hdi_97%
Intercept 153.373 0.698 152.118 154.718
c_height 4.948 0.179 4.628 5.290
weight_sigma 28.961 0.486 28.057 29.855

2.1.3 Point prediction

new = pd.DataFrame(dict(c_height=[4]))
model_2.predict(idata_2, data=new)
predicted_2 = idata_2.posterior["weight_mean"].values.mean()
round(predicted_2, 1)
173.2

2.1.4 Posterior simulations

variation coming from posterior uncertainty in the coefficients

linpred_2 = model_2.predict(idata_2, data=new, inplace=False)
plt.hist(linpred_2.posterior["weight_mean"].values.ravel(), color="gray");

2.1.5 Posterior predictive simulations

variation coming from posterior uncertainty in the coefficients and predictive uncertainty

postpred_2 = model_2.predict(idata_2, data=new, kind="pps", inplace=False)
plt.hist(postpred_2.posterior_predictive["weight"].values.ravel(), color="gray");

2.2 Indicator variables

2.2.1 Predict weight (in pounds) from height (in inches)

new = pd.DataFrame(dict(height=[66]))
pred = model_1.predict(idata_1, data=new, kind="pps", inplace=False)
pred_mean = pred.posterior_predictive["weight"].values.mean()
pred_sd = pred.posterior_predictive["weight"].values.std()
print(
    f"Predicted weight for a 66-inch-tall person is {round(pred_mean)} pounds with a sd of {round(pred_sd)}"
)
Predicted weight for a 66-inch-tall person is 153 pounds with a sd of 29

2.2.2 Including a binary variable in a regression

model_3 = bmb.Model("weight ~ c_height + male", data=earnings, dropna=True)
idata_3 = model_3.fit()
az.summary(idata_3, kind="stats")
100.00% [8000/8000 00:06<00:00 Sampling 4 chains, 0 divergences]
mean sd hdi_3% hdi_97%
Intercept 149.530 0.951 147.783 151.342
c_height 3.881 0.253 3.418 4.374
male 11.867 2.028 8.027 15.691
weight_sigma 28.690 0.487 27.762 29.588
new = pd.DataFrame(dict(c_height=[4], male=[0]))
pred = model_3.predict(idata_3, data=new, kind="pps", inplace=False)
pred_mean = pred.posterior_predictive["weight"].values.mean()
pred_sd = pred.posterior_predictive["weight"].values.std()
print(
    f"Predicted weight for a 70-inch-tall female is {round(pred_mean)} pounds with a sd of {round(pred_sd)}"
)
Predicted weight for a 70-inch-tall female is 165 pounds with a sd of 29

2.2.3 Using indicator variables for multiple levels of a categorical predictor

Include ethnicity in the regression as a factor

model_4 = bmb.Model(
    "weight ~ c_height + male + C(ethnicity)", data=earnings, dropna=True
)
idata_4 = model_4.fit()
az.summary(idata_4, kind="stats")
100.00% [8000/8000 00:04<00:00 Sampling 4 chains, 0 divergences]
mean sd hdi_3% hdi_97%
Intercept 154.356 2.208 150.135 158.402
c_height 3.851 0.252 3.380 4.308
male 12.112 1.988 8.309 15.860
C(ethnicity)[Hispanic] -6.132 3.546 -12.492 0.720
C(ethnicity)[Other] -12.239 5.125 -20.977 -1.720
C(ethnicity)[White] -5.208 2.261 -9.204 -0.890
weight_sigma 28.641 0.467 27.777 29.504

2.2.4 Choose the baseline category by setting the levels

earnings["eth"] = pd.Categorical(
    earnings["ethnicity"],
    categories=["White", "Black", "Hispanic", "Other"],
    ordered=True,
)
model_5 = bmb.Model("weight ~ c_height + male + eth", data=earnings, dropna=True)
idata_5 = model_5.fit()
az.summary(idata_5, kind="stats")
100.00% [8000/8000 00:04<00:00 Sampling 4 chains, 0 divergences]
mean sd hdi_3% hdi_97%
Intercept 149.133 0.971 147.309 150.953
c_height 3.857 0.252 3.388 4.334
male 12.077 1.995 8.223 15.741
eth[Black] 5.226 2.288 1.105 9.759
eth[Hispanic] -0.924 2.954 -6.332 4.814
eth[Other] -7.173 4.819 -15.649 2.175
weight_sigma 28.656 0.477 27.747 29.518

2.2.5 Alternatively create indicators for the four ethnic groups directly:

earnings["eth_white"] = earnings["ethnicity"] == "White"
earnings["eth_black"] = earnings["ethnicity"] == "Black"
earnings["eth_hispanic"] = earnings["ethnicity"] == "Hispanic"
earnings["eth_other"] = earnings["ethnicity"] == "Other"
model_6 = bmb.Model(
    "weight ~ c_height + male + eth_black + eth_hispanic + eth_other",
    data=earnings,
    dropna=True,
)
idata_6 = model_6.fit()
az.summary(idata_6, kind="stats")
100.00% [8000/8000 00:15<00:00 Sampling 4 chains, 0 divergences]
mean sd hdi_3% hdi_97%
Intercept 149.125 0.979 147.387 151.050
c_height 3.851 0.252 3.390 4.330
male 12.120 1.976 8.395 15.663
eth_black 5.191 2.225 1.037 9.428
eth_hispanic -0.970 2.937 -6.516 4.384
eth_other -7.013 4.819 -16.082 2.245
weight_sigma 28.656 0.474 27.810 29.576