This notebook demonstrates the following dimensionality reduction methods and their corresponding 2D visualizations: PCA, SpectralEmbedding, Isomap, t-SNE, MDS, and LocallyLinearEmbedding.

vis_manifold

Function

In [1]:
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding, Isomap, MDS, SpectralEmbedding, TSNE
import numpy as np
from collections import OrderedDict
from time import time
from functools import partial
from matplotlib.ticker import NullFormatter
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def vis_manifold(x, y=None, first_nval=None, n_neighbors=10, n_components=3, n_limit=10000):
    
    '''Quick dimensionality reduction for panel data using sklearn.manifold functionality.
    
    Args:
        x: feature DataFrame
        y: optional Series used as the target
        first_nval: if given, also show 2D visualizations colored by each of the first n columns of x
        n_neighbors: n_neighbors passed to the neighborhood-based manifold methods
        n_components: number of components computed by each method
        n_limit: if x has more than n_limit data points, downsample to n_limit (stratified if y is provided)
    '''
    # parameters
    figsize = (18, 3)
    colname_ls = x.columns
    
    # input validation
    if y is not None:
        if np.ndim(y) != 1:
            raise TypeError('np.ndim(y) != 1')
        if x.shape[0] != len(y):
            raise TypeError('Dimension mismatch: x.shape[0] != len(y)')
        if len(y.unique()) <= n_neighbors:
            n_neighbors = len(y.unique())
            print('parameter update: len(y.unique()) = %d => n_neighbors' % n_neighbors)
            
    # preprocessing: optional downsampling, then standardization
    if x.shape[0] > n_limit:
        if y is not None:
            x_index, _, y, _ = train_test_split(range(x.shape[0]), y, train_size=n_limit / x.shape[0],
                                                stratify=y, random_state=42)
            y = (y - y.mean()) / y.std()  # rescale the subsampled target for coloring
            x = x.iloc[x_index, :].reset_index(drop=True)
            print('parameter update: stratified downsample to %d' % n_limit)
        else:
            x = x.sample(n=n_limit, random_state=1).reset_index(drop=True)
            print('parameter update: downsample to %d' % n_limit)
    scaler = StandardScaler()
    x = scaler.fit_transform(x)  # x is an ndarray from here on
    
    # instantiate the estimators (keyword arguments avoid sklearn's positional-arg deprecation)
    methods = OrderedDict()
    methods['pca'] = PCA(n_components=n_components)
    methods['SE'] = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors)
    methods['Isomap'] = Isomap(n_neighbors=n_neighbors, n_components=n_components)
    methods['t-SNE'] = TSNE(n_components=n_components, init='pca', random_state=0)
    methods['MDS'] = MDS(n_components=n_components, max_iter=100, n_init=1)
    LLE = partial(LocallyLinearEmbedding, n_neighbors=n_neighbors, n_components=n_components,
                  eigen_solver='auto')
    methods['LLE'] = LLE(method='standard')

    # fit each method and record its wall-clock time
    Y = np.empty((len(methods), x.shape[0], n_components), dtype=float)
    t = np.empty(len(methods), dtype=float)
    for i, (label, method) in enumerate(methods.items()):
        t0 = time()
        Y[i] = method.fit_transform(x)
        t[i] = time() - t0
        print("%s: %.2g sec" % (label, t[i]))
        if label == 'pca':
            plt.bar(range(n_components), method.explained_variance_ratio_)
            plt.plot(range(n_components), np.cumsum(method.explained_variance_ratio_), 'o-')
            plt.title("Component-wise and Cumulative Explained Variance")
            
    # 2D scatter of the first two components for every method
    fig = plt.figure(figsize=figsize)
    if y is not None:
        fig.suptitle("Manifold Learning : (color = %s)" % y.name, fontsize=14)
    else:
        fig.suptitle("Manifold Learning", fontsize=14)
    for i, (label, method) in enumerate(methods.items()):
        ax = fig.add_subplot(1, len(methods), 1 + i)
        if y is not None:
            plt.scatter(Y[i][:, 0], Y[i][:, 1], c=np.array(y),
                        marker='.', cmap=plt.cm.Spectral)
        else:
            plt.scatter(Y[i][:, 0], Y[i][:, 1], marker='.')
        plt.xlabel(label)
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
    plt.show()
    # optional extra figures, colored by each of the first n columns of x
    if first_nval is not None:
        for j in range(min(first_nval, len(colname_ls))):
            fig = plt.figure(figsize=figsize)
            fig.suptitle("Manifold Learning : (color = %s)" % colname_ls.values[j], fontsize=14)
            for i, (label, method) in enumerate(methods.items()):
                ax = fig.add_subplot(1, len(methods), 1 + i)
                plt.scatter(Y[i][:, 0], Y[i][:, 1], c=x[:, j],
                            marker='.', cmap=plt.cm.Spectral)
                ax.xaxis.set_major_formatter(NullFormatter())
                ax.yaxis.set_major_formatter(NullFormatter())
            plt.show()
    return methods
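
The target is optional: when y is omitted, the overview figure is drawn uncolored, and first_nval can still be used to color the embeddings by feature values. A minimal sketch of that unsupervised path, assuming a hypothetical feature DataFrame df:

# Sketch: unsupervised usage (df is a hypothetical feature DataFrame).
# No target is passed, so the overview figure is uncolored and the
# first_nval figures are colored by the first three columns of df.
df = pd.DataFrame(np.random.RandomState(0).randn(500, 6),
                  columns=['f%d' % i for i in range(6)])
models = vis_manifold(x=df, first_nval=3)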

Data generation

In [2]:
from sklearn.datasets import make_classification
x, y = make_classification(n_samples=3000, n_features = 10, n_classes = 3,
                           n_informative = 3, random_state=0)
In [3]:
x = pd.DataFrame(x, columns = [ 'col'+str(i) for i in range(x.shape[1]) ])
y = pd.Series(y, name = 'target')
In [4]:
x
Out[4]:
col0 col1 col2 col3 col4 col5 col6 col7 col8 col9
0 -0.246631 1.536473 1.878893 1.511736 -0.627751 -0.477678 1.544578 -1.383914 0.243388 -1.836949
1 0.347440 0.416746 1.177944 0.714926 -1.878313 -1.341999 1.502650 1.269238 0.582506 0.815452
2 0.726722 -0.491154 1.983053 2.067443 -1.382206 0.955374 1.556236 -0.962565 0.103122 -1.361854
3 0.943109 -0.768991 -1.226675 0.179224 0.576035 -0.594412 -0.622367 2.344729 0.163916 2.510684
4 0.558502 -1.396403 -0.356204 0.123369 -0.277405 -0.009907 -1.056255 -1.535977 -0.809389 -1.120022
... ... ... ... ... ... ... ... ... ... ...
2995 0.042244 -0.768635 -0.419798 0.978173 0.110063 0.828541 -0.433958 1.979497 -0.366332 2.250201
2996 -0.966736 0.150888 -0.964605 -0.335861 1.133318 0.664077 -1.059302 1.775004 -0.592055 2.240712
2997 -0.486020 -0.626297 0.361388 0.795056 0.347224 -0.464601 0.029766 -1.917474 -0.099174 -1.960272
2998 -0.639131 0.109291 -0.139325 -0.837591 0.634916 1.940414 -2.107619 -0.199481 -2.532102 1.002135
2999 -1.433774 -0.905081 -0.902083 -0.821010 0.564127 0.022440 -1.606352 -1.578345 -0.957006 -1.001295

3000 rows × 10 columns

In [5]:
y
Out[5]:
0       0
1       0
2       0
3       0
4       2
       ..
2995    2
2996    0
2997    1
2998    1
2999    2
Name: target, Length: 3000, dtype: int64

Output

In [6]:
manifold_models = vis_manifold(x = x, y = y, first_nval = 5)
parameter update: len(y.unique()) = 3 => n_neighbors
pca: 0.0047 sec
SE: 0.39 sec
Isomap: 3.5 sec
t-SNE: 16 sec
MDS: 16 sec
LLE: 0.79 sec
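
The returned dictionary holds the fitted estimators, so the inductive ones can project points afterwards: in scikit-learn, PCA, Isomap, and LocallyLinearEmbedding expose .transform, while t-SNE, MDS, and SpectralEmbedding implement fit_transform only. A minimal sketch follows; note that vis_manifold scales x internally with a StandardScaler it does not return, so the scaling is recreated here (exact in this case, since no downsampling occurred):

# Sketch: reusing the fitted inductive estimators on (scaled) data.
x_scaled = StandardScaler().fit_transform(x)                # mirrors the internal scaling
pca_coords = manifold_models['pca'].transform(x_scaled)     # shape (3000, 3)
iso_coords = manifold_models['Isomap'].transform(x_scaled)  # shape (3000, 3)
lle_coords = manifold_models['LLE'].transform(x_scaled)     # shape (3000, 3)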