This notebook demonstrates the following dimensionality reduction methods and their corresponding 2D visualizations.
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding, Isomap, MDS, SpectralEmbedding, TSNE
import numpy as np
from collections import OrderedDict
from time import time
from functools import partial
from matplotlib.ticker import NullFormatter
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
def _draw_embedding_grid(embeddings, title, figsize, color=None):
    """Draw one row of 2D scatter plots, one subplot per embedding, and show it.

    Args:
        embeddings: ordered mapping of method label -> array whose first two
            columns are plotted against each other.
        title: figure-level title.
        figsize: matplotlib figure size tuple.
        color: optional per-point values used to color the markers.
    """
    fig = plt.figure(figsize=figsize)
    fig.suptitle(title, fontsize=14)
    # A colormap is only passed together with color values; without `c`
    # there is nothing for it to map.
    scatter_kwargs = {'marker': '.'}
    if color is not None:
        scatter_kwargs.update(c=np.asarray(color), cmap=plt.cm.Spectral)
    for position, (label, embedding) in enumerate(embeddings.items(), start=1):
        ax = fig.add_subplot(1, len(embeddings), position)
        ax.scatter(embedding[:, 0], embedding[:, 1], **scatter_kwargs)
        ax.set_xlabel("%s" % (label))
        # Embedding coordinates carry no interpretable unit: hide tick labels.
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
    plt.show()


def vis_manifold(x, y = None, first_nval = None, n_neighbors = 10, n_components = 3, n_limit = 10000):
    """Quick dimensionality reduction and 2D visualization for tabular data.

    Standardizes `x`, fits PCA, SpectralEmbedding, Isomap, t-SNE, MDS and
    LocallyLinearEmbedding on it, prints the fit time of each method, plots
    the PCA explained-variance ratios, and draws the first two components of
    every embedding side by side.

    Args:
        x: DataFrame of features (rows are data points).
        y: optional Series used as target; when given, points are colored by
            it and it drives stratified downsampling.
        first_nval: when given, additionally draw one figure per column for
            the first `first_nval` columns of `x`, colored by the
            standardized column values. Values <= 0 draw nothing.
        n_neighbors: neighborhood size for the neighbor-based methods. It is
            lowered to the number of distinct values in `y` when that number
            does not exceed it.
        n_components: number of embedding dimensions to compute (only the
            first two are plotted).
        n_limit: if `x` has more rows than this, downsample to `n_limit` rows
            (stratified on `y` when `y` is given).

    Returns:
        OrderedDict mapping method label -> fitted estimator.

    Raises:
        ValueError: if `n_components` is smaller than 2.
        TypeError: if `y` is not one-dimensional or its length differs from
            the number of rows in `x`.
    """
    # parameters
    figsize = (18, 3)
    colname_ls = x.columns

    # diagnosis
    if n_components < 2:
        # Every figure plots component 0 against component 1.
        raise ValueError('n_components must be >= 2 to draw 2D scatter plots')
    if y is not None:
        # np.rank no longer exists in NumPy; np.ndim is its replacement.
        if np.ndim(y) != 1:
            raise TypeError('np.rank(y) != 1')
        if x.shape[0] != len(y):
            raise TypeError('Dimention dismatch: x.shape[0], len(y)')
        n_unique = len(y.unique())
        if n_unique <= n_neighbors:
            n_neighbors = n_unique
            print('parameter update: len(y.unique()) = %d => n_neighbors' % (n_neighbors))

    # preprocessing
    if x.shape[0] > n_limit:
        if y is not None:
            # An integer train_size selects exactly n_limit rows; a float
            # ratio can lose a row to rounding.
            x_index, _, y, _ = train_test_split(
                np.arange(x.shape[0]), y, train_size=int(n_limit),
                stratify=y, random_state=42)
            x = x.iloc[x_index, :].reset_index(drop=True)
            # y is only used for coloring, so it is kept on its original
            # scale; just realign its index with the resampled x.
            y = y.reset_index(drop=True)
            print('parameter update: stratified downsample to %d' % (n_limit))
        else:
            x = x.sample(n=n_limit, random_state=1).reset_index(drop=True)
            print('parameter update: downsample to %d' % (n_limit))
    x_scaled = StandardScaler().fit_transform(x)

    # instantiated
    # Keyword arguments throughout: Isomap and LocallyLinearEmbedding do not
    # accept these parameters positionally in current scikit-learn.
    methods = OrderedDict()
    methods['pca'] = PCA(n_components=n_components)
    methods['SE'] = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors)
    methods['Isomap'] = Isomap(n_neighbors=n_neighbors, n_components=n_components)
    methods['t-SNE'] = TSNE(n_components=n_components, init='pca', random_state=0)
    methods['MDS'] = MDS(n_components=n_components, max_iter=100, n_init=1)
    methods['LLE'] = LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=n_components,
        eigen_solver='auto', method='standard')

    # manifold learning
    embeddings = OrderedDict()
    for label, method in methods.items():
        t0 = time()
        embeddings[label] = method.fit_transform(x_scaled)
        print("%s: %.2g sec" % (label, time() - t0))
        if label == 'pca':
            # Separate figure so the variance plot never lands on a figure
            # the caller already had open.
            plt.figure()
            plt.bar(range(n_components), method.explained_variance_ratio_)
            plt.plot(range(n_components), np.cumsum(method.explained_variance_ratio_), 'o-')
            plt.title("Component-wise and Cumulative Explained Variance")

    # Create figure
    if y is not None:
        _draw_embedding_grid(
            embeddings, "Manifold Learning : (color = %s)" % (y.name),
            figsize, color=y)
    else:
        _draw_embedding_grid(embeddings, "Manifold Learning", figsize)

    if first_nval is not None:
        # Slicing caps the count at the number of columns and yields nothing
        # for first_nval <= 0.
        for j, colname in enumerate(colname_ls[:max(first_nval, 0)]):
            _draw_embedding_grid(
                embeddings, "Manifold Learning : (color = %s)" % (colname),
                figsize, color=x_scaled[:, j])
    return methods
# Demo: build a synthetic 3-class dataset and run every embedding on it.
from sklearn.datasets import make_classification

x, y = make_classification(
    n_samples=3000,
    n_features=10,
    n_informative=3,
    n_classes=3,
    random_state=0,
)
# Wrap the raw arrays so the feature columns and the target carry names;
# vis_manifold reads x.columns and y.name for its figure titles.
column_names = ['col' + str(position) for position in range(x.shape[1])]
x = pd.DataFrame(x, columns=column_names)
y = pd.Series(y, name='target')
# Bare expressions: shown as cell output in a notebook, no effect in a script.
x
y
manifold_models = vis_manifold(x=x, y=y, first_nval=5)