6. Data Imputation with Autoencoders
Autoencoders may be used for data imputation. Let’s see how data imputation with an autoencoder works.
6.1. Data
The data is sampled as follows.
\(X_0 \sim \mathcal{N}(0, 1)\)
\(X_1 \sim \mathcal{N}(1.1 + 4 X_0, 1)\)
\(X_2 \sim \mathcal{N}(2.3 - 0.5 X_0, 1)\)
[1]:
import numpy as np
import random
# Seed both NumPy's and the standard library's generators so the sampled data
# and any later use of `random` are reproducible.
np.random.seed(37)
random.seed(37)

size = 1_000

# X_0 is standard normal; X_1 and X_2 are linear in X_0 plus independent
# unit-variance Gaussian noise.
X_0 = np.random.normal(0, 1, size=size)
X_1 = 1.1 + 4 * X_0 + np.random.normal(0, 1, size=size)
X_2 = 2.3 - 0.5 * X_0 + np.random.normal(0, 1, size=size)

# Place the three variables side by side as the columns of a (size, 3) matrix.
X = np.column_stack((X_0, X_1, X_2))
X.shape
[1]:
(1000, 3)
6.2. Missing data
We will randomly set 10% of the individual values (cells, not whole rows) to missing.
[2]:
import itertools
import pandas as pd
def make_missing(X, frac=0.1):
    """Return a copy of ``X`` with a random fraction of its cells set to NaN.

    The number of blanked cells is ``int(frac * n_rows * n_cols)``. Cells are
    chosen by shuffling every (row, col) pair with the standard library's
    ``random`` module, so seed that module for reproducible results. ``X``
    itself is not modified.

    Returns a tuple ``(frame, coordinates)``: ``frame`` is a DataFrame with
    columns ``X_0``, ``X_1``, ... holding the masked copy, and ``coordinates``
    is the list of ``(row, col)`` tuples that were set to NaN.
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    n_missing = int(frac * n_rows * n_cols)

    # Enumerate every cell in row-major order, then keep the first
    # n_missing cells of a random permutation.
    cells = list(itertools.product(range(n_rows), range(n_cols)))
    random.shuffle(cells)
    coordinates = cells[:n_missing]

    masked = np.copy(X)
    for row, col in coordinates:
        masked[row, col] = np.nan

    frame = pd.DataFrame(masked, columns=[f'X_{i}' for i in range(n_cols)])
    return frame, coordinates
# Blank out the default 10% of the cells of X; `coordinates` holds the
# (row, col) positions that were set to NaN.
df, coordinates = make_missing(X)
[3]:
df.isna().sum()
[3]:
X_0 99
X_1 102
X_2 99
dtype: int64
[4]:
df.isna().sum().sum()
[4]:
300
[5]:
import missingno as msno
# Draw missingno's matrix plot of `df`; the return value is bound to `_`
# rather than left as the cell's last expression.
_ = msno.matrix(df)
Denote the following.
N: the rows that have no missing values (will be used for training)
T: the ground-truth values of the rows that have at least one missing value (will be used for validation)
M: the rows that have at least one missing value (will be used for testing)
[6]:
# Boolean mask over the rows of `df`: True where at least one value is NaN.
rows_with_nan = df.isnull().any(axis=1)

# N: rows that are fully observed.
N_df = df.dropna()
# M: rows with at least one NaN, exactly as they appear in `df`.
M_df = df[rows_with_nan]
# T: the same rows as M, taken from the original matrix `X`.
T_df = pd.DataFrame(X[rows_with_nan, :], columns=N_df.columns)

N_df.shape, T_df.shape, M_df.shape
[6]:
((732, 3), (268, 3), (268, 3))
[7]:
T_df.iloc[0]
[7]:
X_0 -0.054464
X_1 0.222191
X_2 2.381753
Name: 0, dtype: float64
[8]:
M_df.iloc[0]
[8]:
X_0 -0.054464
X_1 NaN
X_2 2.381753
Name: 0, dtype: float64
6.3. Dataset, Data Loader
We will have to create our datasets and data loaders.
[9]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import *
class SampleDataset(Dataset):
    """Map-style dataset over the rows of a 2-D array.

    Each item is a ``(row, clazz)`` pair, where ``clazz`` is the same
    constant label for every row.
    """

    def __init__(self, X, device, clazz=0):
        """Keep references to the data, the device and the constant label.

        ``device`` is only stored; nothing in this class moves data to it.
        """
        self.__X = X
        self.__clazz = clazz
        self.__device = device

    def __len__(self):
        """Return the number of rows in the wrapped array."""
        n_rows = self.__X.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the ``idx``-th row together with the constant label."""
        row = self.__X[idx, :]
        label = self.__clazz
        return row, label
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Fully observed rows, used for training.
N_ds = SampleDataset(X=N_df.values, device=device)
# Rows with missing values; each NaN is replaced by 0.0 so the rows can be
# fed to the network.
M_ds = SampleDataset(X=M_df.fillna(0.0).values, device=device)

# Training batches are reshuffled on every pass.
N_dl = DataLoader(N_ds, batch_size=64, shuffle=True, num_workers=1)
# Not shuffled: get_performance pairs prediction row r from this loader with
# row r of M_df / T_df, so the loader must preserve row order.
M_dl = DataLoader(M_ds, batch_size=64, shuffle=False, num_workers=1)
cuda
6.4. Autoencoder
The two autoencoder architectures are adopted from the following.
[10]:
from torchvision import datasets
from torchvision import transforms
class AE1(torch.nn.Module):
    """Fully connected autoencoder with ReLU activations.

    The encoder maps ``input_size -> 128 -> 64 -> 36 -> 18 -> 9`` and the
    decoder mirrors it back to ``input_size``. The input is passed through
    ``Dropout(p=0.5)`` before it reaches the encoder.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        self.drop_out = torch.nn.Dropout(p=0.5)

        widths = [input_size, 128, 64, 36, 18, 9]
        self.encoder = self._mlp(widths)
        self.decoder = self._mlp(widths[::-1])

    @staticmethod
    def _mlp(widths):
        """Chain Linear layers over consecutive widths with ReLU in between.

        No activation follows the last Linear layer.
        """
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            if layers:
                layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Linear(fan_in, fan_out))
        return torch.nn.Sequential(*layers)

    def forward(self, x):
        """Apply dropout to ``x``, encode it, and return the decoded output."""
        return self.decoder(self.encoder(self.drop_out(x)))
class AE2(torch.nn.Module):
    """Fully connected autoencoder with Tanh activations.

    Layer widths grow by ``theta`` per encoder layer, from ``dim`` up to
    ``dim + 3 * theta``, and the decoder mirrors them back down to ``dim``.
    The input is passed through ``Dropout(p=0.5)`` before it reaches the
    encoder.
    """

    def __init__(self, dim, theta=7):
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.drop_out = torch.nn.Dropout(p=0.5)

        widths = [dim + theta * k for k in range(4)]
        self.encoder = self._mlp(widths)
        self.decoder = self._mlp(widths[::-1])

    @staticmethod
    def _mlp(widths):
        """Chain Linear layers over consecutive widths with Tanh in between.

        No activation follows the last Linear layer.
        """
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            if layers:
                layers.append(torch.nn.Tanh())
            layers.append(torch.nn.Linear(fan_in, fan_out))
        return torch.nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, dim)``, apply dropout, encode and decode."""
        flat = x.view(-1, self.dim)
        corrupted = self.drop_out(flat)
        restored = self.decoder(self.encoder(corrupted))
        return restored.view(-1, self.dim)
6.5. Learning
We will train two autoencoder models and compare how they perform with data imputation.
[11]:
def train(model, optimizer, epochs=20, data_loader=None):
    """Train ``model`` to reconstruct its input and return the loss history.

    Parameters
    ----------
    model : torch.nn.Module
        The autoencoder to train; it is switched to training mode.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    epochs : int, default 20
        Number of passes over ``data_loader``.
    data_loader : iterable of ``(items, label)`` batches, optional
        Defaults to the notebook's training loader ``N_dl``. The label in
        each batch is ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``epoch`` (starting at 1) with one column ``loss``: the
        mean of the per-batch MSE losses for that epoch.
    """
    if data_loader is None:
        data_loader = N_dl

    # Batches are moved to wherever the model's parameters live, so the
    # function does not depend on a global device variable.
    model_device = next(model.parameters()).device

    # Make sure dropout is active even if the model was put in eval mode
    # before this call.
    model.train()

    loss_function = torch.nn.MSELoss()
    records = []
    for epoch in range(1, epochs + 1):
        batch_losses = []
        for items, _ in data_loader:
            items = items.to(model_device)
            optimizer.zero_grad()
            reconstructed = model(items)
            # The target is the input itself.
            loss = loss_function(reconstructed, items)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        records.append({
            'epoch': epoch,
            'loss': np.array(batch_losses).mean(),
        })

    # Explicit columns keep the frame well-formed even when epochs == 0.
    return pd.DataFrame(records, columns=['epoch', 'loss']).set_index('epoch')
[12]:
# First autoencoder: one input unit per data column, cast to float64 and
# moved to the selected device, then optimized with Adam.
model_1 = AE1(input_size=N_df.shape[1])
model_1 = model_1.double().to(device)
opt_1 = torch.optim.Adam(
    model_1.parameters(),
    lr=1e-3,
    weight_decay=1e-8,
)
loss_1 = train(model_1, opt_1)
[13]:
# Second autoencoder: same input width, cast to float64 and moved to the
# selected device, then optimized with SGD using Nesterov momentum.
model_2 = AE2(dim=N_df.shape[1])
model_2 = model_2.double().to(device)
opt_2 = torch.optim.SGD(
    model_2.parameters(),
    momentum=0.99,
    lr=0.01,
    nesterov=True,
)
loss_2 = train(model_2, opt_2)
[14]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Plot both per-epoch loss curves on the same axes.
ax = loss_1['loss'].plot(kind='line', figsize=(15, 4), title='MSE Loss', ylabel='MSE', label='AE1')
_ = loss_2['loss'].plot(kind='line', figsize=(15, 4), title='MSE Loss', ylabel='MSE', ax=ax, label='AE2')
# One tick per recorded epoch, taken from the loss frame's index instead of
# a hard-coded 1..20 range.
_ = ax.set_xticks(list(loss_1.index))
_ = ax.legend()
6.6. Imputation
Now we will impute the data using the two autoencoders. The performance will be the average L2 distance between the imputed and true data.
[15]:
def predict(m, items, device):
    """Run ``m`` on ``items`` in inference mode and return a NumPy array.

    The module is switched to eval mode for the forward pass, so dropout
    layers act as the identity, and gradient tracking is disabled. The
    module's previous train/eval mode is restored before returning.

    Parameters
    ----------
    m : torch.nn.Module
        The model to evaluate.
    items : torch.Tensor
        Input batch; it is moved to ``device`` before the forward pass.
    device : torch.device
        Device on which ``m`` lives.

    Returns
    -------
    numpy.ndarray
        The model output, copied to host memory.
    """
    was_training = m.training
    m.eval()
    try:
        with torch.no_grad():
            return m(items.to(device)).detach().cpu().numpy()
    finally:
        # Leave the caller's model in the mode it was in.
        m.train(was_training)
def get_imputation(m_v, p_v, t_v):
    """Merge predicted and true values for one row.

    For each position, the predicted value from ``p_v`` is used where the
    masked row ``m_v`` is missing (NaN); otherwise the true value from
    ``t_v`` is used. The result is returned as a NumPy array.
    """
    merged = [
        predicted if pd.isna(masked) else truth
        for masked, predicted, truth in zip(m_v, p_v, t_v)
    ]
    return np.array(merged)
def get_performance(model):
    """Return ``(n_perf, m_perf)``: mean per-row L2 errors of ``model``.

    ``n_perf`` is the mean L2 distance between each fully observed row of
    ``N_df`` and the model's reconstruction of it. ``m_perf`` is the mean L2
    distance between each ground-truth row of ``T_df`` and the imputed row,
    where the imputed row uses the model's prediction at the positions that
    are NaN in ``M_df`` and the true value elsewhere.

    Predictions are computed directly on the frames' arrays, in frame order,
    rather than collected from the shuffled data loaders. Prediction row r
    therefore always corresponds to row r of ``N_df`` / ``M_df`` / ``T_df``.
    """
    N_true = N_df.values
    T_true = T_df.values
    M_obs = M_df.values

    # torch.tensor copies the data, so the frames are never aliased.
    N_pred = predict(model, torch.tensor(N_true), device)
    # Missing entries are fed to the model as 0.0, as in M_ds.
    M_pred = predict(model, torch.tensor(M_df.fillna(0.0).values), device)

    I_pred = np.array([
        get_imputation(M_obs[r, :], M_pred[r, :], T_true[r, :])
        for r in range(M_obs.shape[0])
    ])

    # Row-wise Euclidean norms, averaged over rows.
    n_perf = np.linalg.norm(N_true - N_pred, ord=2, axis=1).mean()
    m_perf = np.linalg.norm(T_true - I_pred, ord=2, axis=1).mean()
    return n_perf, m_perf
The first value is the training performance and the second value is the testing/validation performance. Lower is better. The results for the first autoencoder are shown below.
[16]:
get_performance(model_1)
[16]:
(4.419058376740261, 2.0210263067003247)
The results for the second autoencoder are shown below.
[17]:
get_performance(model_2)
[17]:
(4.414670724819407, 1.9455886899656865)