import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import pearsonr

pd.options.display.float_format = "{:.4f}".format

plt.rcParams["figure.figsize"] = (10, 5.5)
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.color"] = "lightgray"
plt.rcParams["grid.linewidth"] = 0.5
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False

SCATTER_COLOR = "#4472C4"
LINE_COLOR = "firebrick"

panel = pd.read_csv("data/state_panel.csv")
us_trend = pd.read_csv("data/us_fatality_rate_by_year.csv")
nhtsa_raw = pd.read_csv("data/nhtsa_raw_1994_2015.csv")

print(f"State panel: {panel.shape[0]} rows, {panel.shape[1]} columns")
print(f"US trend:    {us_trend.shape[0]} rows")
print(f"NHTSA raw:   {nhtsa_raw.shape[0]} rows (state-year level)")
panel.head()

State panel: 50 rows, 7 columns
US trend:    22 rows
NHTSA raw:   1100 rows (state-year level)

rv = panel["fatalities_per_100k_1994_2015"]
print(f"Mean:   {rv.mean():.4f}")
print(f"Median: {rv.median():.4f}")
print(f"Min:    {rv.min():.4f}  ({panel.loc[rv.idxmin(), 'state_name']})")
print(f"Max:    {rv.max():.4f}  ({panel.loc[rv.idxmax(), 'state_name']})")

Mean:   15.2100
Median: 14.3784
Min:    6.4032  (Massachusetts)
Max:    29.7071  (Wyoming)

sorted_states = panel.sort_values("fatalities_per_100k_1994_2015", ascending=False)
fig, ax = plt.subplots(figsize=(13, 6))
ax.bar(sorted_states["state_abbr"], sorted_states["fatalities_per_100k_1994_2015"],
       color=SCATTER_COLOR, edgecolor="white")
ax.set_title("Average fatality rate per 100,000 residents by state (1994-2015)", fontsize=12)
ax.set_xlabel("State")
ax.set_ylabel("Fatalities per 100,000")
ax.tick_params(axis="x", rotation=90, labelsize=8)
ax.set_ylim(0, 32)
plt.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(us_trend["year"], us_trend["us_fatality_rate_per_100k"],
        marker="o", color=SCATTER_COLOR, linewidth=2)
ax.set_title("US fatality rate per 100,000, 1994-2015", fontsize=12)
ax.set_xlabel("Year")
ax.set_ylabel("Fatalities per 100,000")
ax.set_ylim(0, 18)
plt.tight_layout()
plt.show()

print(f"Average US rate 1994-2015: {us_trend['us_fatality_rate_per_100k'].mean():.4f} per 100K")
print(f"1994 rate: {us_trend.iloc[0]['us_fatality_rate_per_100k']:.4f}")
print(f"2015 rate: {us_trend.iloc[-1]['us_fatality_rate_per_100k']:.4f}")

Average US rate 1994-2015: 13.5255 per 100K
1994 rate: 15.6479
2015 rate: 10.9336

def regression_panel(df, x_col, y_col, x_label, title, xfmt=None, eq_xy=(0.05, 0.95)):
    x = df[x_col]
    y = df[y_col]
    r, p_value = pearsonr(x, y)

    X = sm.add_constant(x)
    model = sm.OLS(y, X).fit()
    slope = model.params[x_col]
    intercept = model.params["const"]
    r_squared = model.rsquared

    fig, ax = plt.subplots(figsize=(9, 5.5))
    ax.scatter(x, y, color=SCATTER_COLOR, alpha=0.75, edgecolors="white", s=60)
    x_line = np.linspace(x.min(), x.max(), 100)
    ax.plot(x_line, slope * x_line + intercept, color=LINE_COLOR, linewidth=2)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Fatalities per 100,000")
    ax.set_title(title, fontsize=12)

    eqn_sign = "+" if intercept >= 0 else "-"
    eqn = f"y = {slope:.5g}x {eqn_sign} {abs(intercept):.5g}"
    ax.text(eq_xy[0], eq_xy[1],
            f"{eqn}\nr = {r:.4f}    R² = {r_squared:.4f}    p = {p_value:.4f}",
            transform=ax.transAxes, va="top", fontsize=10,
            bbox=dict(boxstyle="round,pad=0.4", facecolor="white", alpha=0.85, edgecolor="lightgray"))

    if xfmt:
        ax.xaxis.set_major_formatter(plt.FuncFormatter(xfmt))

    plt.tight_layout()
    plt.show()
    return {"r": r, "r_squared": r_squared, "slope": slope, "intercept": intercept, "p_value": p_value}

r1 = regression_panel(
    panel, "avg_miles_per_driver_2022", "fatalities_per_100k_1994_2015",
    "Average miles per driver (2022)",
    "Miles per driver vs fatality rate",
)

r2 = regression_panel(
    panel, "police_count_2020", "fatalities_per_100k_1994_2015",
    "Full-time state and local police count (2020)",
    "Police count vs fatality rate",
    eq_xy=(0.55, 0.95),
)

r3 = regression_panel(
    panel, "alcohol_gallons_per_capita_2023", "fatalities_per_100k_1994_2015",
    "Alcohol consumption (gallons per capita, 2023)",
    "Alcohol consumption vs fatality rate",
    eq_xy=(0.55, 0.95),
)

r4 = regression_panel(
    panel, "pct_pickup_trucks_2022", "fatalities_per_100k_1994_2015",
    "Pickup truck share of registered vehicles (2022)",
    "Pickup truck share vs fatality rate",
    xfmt=lambda x, _: f"{x*100:.0f}%",
    eq_xy=(0.05, 0.95),
)

X = panel[["avg_miles_per_driver_2022", "pct_pickup_trucks_2022"]]
X = sm.add_constant(X)
y = panel["fatalities_per_100k_1994_2015"]
joint_model = sm.OLS(y, X).fit()
print(joint_model.summary())

                                  OLS Regression Results                                 
=========================================================================================
Dep. Variable:     fatalities_per_100k_1994_2015   R-squared:                       0.496
Model:                                       OLS   Adj. R-squared:                  0.474
Method:                            Least Squares   F-statistic:                     23.12
Date:                           Sun, 17 May 2026   Prob (F-statistic):           1.02e-07
Time:                                   14:44:24   Log-Likelihood:                -137.49
No. Observations:                             50   AIC:                             281.0
Df Residuals:                                 47   BIC:                             286.7
Df Model:                                      2                                         
Covariance Type:                       nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                        -4.2598      2.921     -1.459      0.151     -10.135       1.616
avg_miles_per_driver_2022     0.0011      0.000      5.240      0.000       0.001       0.001
pct_pickup_trucks_2022       20.6490      9.121      2.264      0.028       2.300      38.998
==============================================================================
Omnibus:                        4.511   Durbin-Watson:                   1.545
Prob(Omnibus):                  0.105   Jarque-Bera (JB):                3.434
Skew:                           0.595   Prob(JB):                        0.180
Kurtosis:                       3.481   Cond. No.                     2.47e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.47e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

# Predicted vs actual diagnostic for the joint model
y_pred = joint_model.predict(X)
joint_r_squared = joint_model.rsquared

fig, ax = plt.subplots(figsize=(9, 5.5))
ax.scatter(y_pred, y, color=SCATTER_COLOR, alpha=0.75, edgecolors="white", s=60)
lim_lo, lim_hi = 0, max(y.max(), y_pred.max()) * 1.05
ax.plot([lim_lo, lim_hi], [lim_lo, lim_hi], color=LINE_COLOR, linewidth=2, linestyle="--")
ax.set_xlabel("Predicted fatality rate (joint model)")
ax.set_ylabel("Actual fatality rate")
ax.set_title("Joint regression: predicted vs actual fatality rate", fontsize=12)
ax.text(0.05, 0.95,
        f"R² = {joint_r_squared:.4f}    n = {len(y)}\nDashed line: y = x (perfect fit)",
        transform=ax.transAxes, va="top", fontsize=10,
        bbox=dict(boxstyle="round,pad=0.4", facecolor="white", alpha=0.85, edgecolor="lightgray"))
ax.set_xlim(lim_lo, lim_hi)
ax.set_ylim(lim_lo, lim_hi)
plt.tight_layout()
plt.show()

X = panel[["avg_miles_per_driver_2022", "police_count_2020",
           "alcohol_gallons_per_capita_2023", "pct_pickup_trucks_2022"]]
X = sm.add_constant(X)
y = panel["fatalities_per_100k_1994_2015"]
full_model = sm.OLS(y, X).fit()
print(full_model.summary())

                                  OLS Regression Results                                 
=========================================================================================
Dep. Variable:     fatalities_per_100k_1994_2015   R-squared:                       0.508
Model:                                       OLS   Adj. R-squared:                  0.464
Method:                            Least Squares   F-statistic:                     11.60
Date:                           Sun, 17 May 2026   Prob (F-statistic):           1.48e-06
Time:                                   14:44:24   Log-Likelihood:                -136.90
No. Observations:                             50   AIC:                             283.8
Df Residuals:                                 45   BIC:                             293.4
Df Model:                                      4                                         
Covariance Type:                       nonrobust                                         
===================================================================================================
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const                              -2.8230      4.338     -0.651      0.518     -11.560       5.914
avg_miles_per_driver_2022           0.0011      0.000      4.901      0.000       0.001       0.001
police_count_2020               -3.109e-05   3.28e-05     -0.949      0.348   -9.71e-05    3.49e-05
alcohol_gallons_per_capita_2023    -0.4488      1.095     -0.410      0.684      -2.654       1.756
pct_pickup_trucks_2022             20.6774      9.586      2.157      0.036       1.371      39.984
==============================================================================
Omnibus:                        3.391   Durbin-Watson:                   1.514
Prob(Omnibus):                  0.183   Jarque-Bera (JB):                2.412
Skew:                           0.500   Prob(JB):                        0.299
Kurtosis:                       3.399   Cond. No.                     4.29e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.29e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

pip install -r requirements.txt
jupyter notebook analysis.ipynb

Variable	Year	Source
`fatalities_per_100k_1994_2015` (response)	1994 to 2015	NHTSA FARS, normalised per 100,000 residents
`avg_miles_per_driver_2022`	2022	The Zebra, derived from Federal Highway Administration
`police_count_2020`	2020	Statista
`alcohol_gallons_per_capita_2023`	2023	World Population Review
`pct_pickup_trucks_2022`	2022	Industry data

	state_name	state_abbr	avg_miles_per_driver_2022	police_count_2020	alcohol_gallons_per_capita_2023	pct_pickup_trucks_2022	fatalities_per_100k_1994_2015
0	Alaska	AK	13090.0000	11372	2.8500	0.3080	11.7117
1	Alabama	AL	11111.0000	1230	1.9900	0.2050	22.3701
2	Arizona	AR	17224.0000	14658	2.2500	0.1510	22.1872
3	Arkansas	AZ	12524.0000	6570	1.7800	0.2430	17.9259
4	California	CA	12899.0000	93682	2.4900	0.1170	10.4285

Motor Vehicle Fatality Risk Across the United States¶

A Python port of a 2023 state-level investigative analysis¶

Question¶

Data¶

Method¶

Files¶

1. Load the data¶

2. Response variable: state-level fatality rate¶

3. The US-wide trend¶

4. Single-predictor analysis¶

4.1 Average miles driven per driver¶

4.2 Police force size¶

4.3 Alcohol consumption per capita¶

5. Joint regression on the two strongest predictors¶

6. Multivariate OLS with all four predictors¶

7. Findings¶

Limitations¶

How to reproduce¶

Motor Vehicle Fatality Risk Across the United States¶

A Python port of a 2023 state-level investigative analysis¶

Question¶

Data¶

Method¶

Files¶

1. Load the data¶

2. Response variable: state-level fatality rate¶

3. The US-wide trend¶

4. Single-predictor analysis¶

4.1 Average miles driven per driver¶

4.2 Police force size¶

4.3 Alcohol consumption per capita¶

4.4 Pickup truck share¶

5. Joint regression on the two strongest predictors¶

6. Multivariate OLS with all four predictors¶

7. Findings¶

Limitations¶

How to reproduce¶