4. McFadden’s Pseudo R^2
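Here’s a simple way to compute McFadden’s pseudo R^2, defined as $1 - \ln L_{\text{full}} / \ln L_{\text{null}}$, where $L_{\text{full}}$ is the likelihood of the fitted model with all predictors and $L_{\text{null}}$ is the likelihood of the intercept-only model.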
4.1. Simulate data
[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import binomial, normal
from scipy.stats import bernoulli, binom
# Seed NumPy's global random state so the draws made below are repeatable
# on a fresh kernel.
np.random.seed(37)
# Apply seaborn's plot styling for any figures drawn in this notebook.
sns.set(color_codes=True)
def get_data(n=10_000):
    """Simulate a binary-outcome data set from a noisy logistic model.

    The design matrix has a constant column plus two standard-normal
    predictors. The linear score is ``1.0 + 2.0 * x1 + 3.0 * x2`` plus
    standard-normal noise; it is pushed through the logistic function and a
    0/1 outcome is drawn for each row with that probability.

    Parameters
    ----------
    n : int, default 10_000
        Number of rows to simulate.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns ``intercept``, ``x1``, ``x2`` and ``y``.
    """
    # Draw order (x1, x2, noise, y) is kept fixed so a given seed always
    # yields the same data set.
    x1 = normal(0.0, 1.0, n)
    x2 = normal(0.0, 1.0, n)
    X = np.column_stack([np.ones(n), x1, x2])

    weights = np.array([1.0, 2.0, 3.0])
    z = np.dot(X, weights) + normal(0.0, 1.0, n)
    p = 1.0 / (1.0 + np.exp(-z))

    # A binomial with a single trial is a Bernoulli draw per row.
    y = binom.rvs(1, p)

    return pd.DataFrame(X, columns=['intercept', 'x1', 'x2']).assign(y=y)
# Simulate the data set once; the cells below read it as `df`.
df = get_data()
# Display (rows, columns) as the cell output.
df.shape
[1]:
(10000, 4)
4.2. Create Xy
[2]:
# Columns used by the full model: everything except the target `y`.
f_cols = [c for c in df.columns if c != 'y']
# Columns used by the null model: the constant column only.
n_cols = ['intercept']

# Select the predictors with f_cols rather than rebuilding the same list, so
# the full-model columns are defined in exactly one place.
X, y = df[f_cols], df['y']
X.shape, y.shape
[2]:
((10000, 3), (10000,))
4.3. Create the full and null models
[3]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression applies an L2 penalty by default (C=1.0). McFadden's
# pseudo R^2 compares maximised likelihoods, so the penalty is effectively
# switched off by making the inverse regularisation strength C very large.
NO_PENALTY_C = 1e9

# fit_intercept=False because X already carries an explicit 'intercept' column.
f_lr = LogisticRegression(fit_intercept=False, solver='lbfgs', C=NO_PENALTY_C)
n_lr = LogisticRegression(fit_intercept=False, solver='lbfgs', C=NO_PENALTY_C)

# Full model: all predictors. Null model: the constant column only.
f_lr.fit(X[f_cols], y)
n_lr.fit(X[n_cols], y)
[3]:
LogisticRegression(fit_intercept=False)
4.4. Compute pseudo R^2
[4]:
from sklearn.metrics import log_loss

# Predicted probability of the positive class under each fitted model.
f_prob = f_lr.predict_proba(X[f_cols])[:, 1]
n_prob = n_lr.predict_proba(X[n_cols])[:, 1]

# log_loss returns the mean negative log-likelihood by default. Both values
# are averaged over the same rows, so their ratio equals the ratio of the
# two log-likelihoods.
f_llh = log_loss(y, f_prob)
n_llh = log_loss(y, n_prob)

pseudo_r2 = 1 - (f_llh / n_llh)
f_llh, n_llh, pseudo_r2
[4]:
(0.34129810791617254, 0.6759731198350962, 0.4951010655580028)