This notebook demonstrates the following dimensionality reduction methods and their corresponding 2D visualizations: PCA, SpectralEmbedding, Isomap, t-SNE, MDS, and LocallyLinearEmbedding.

vis_manifold

Function

In [1]:
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding, Isomap, MDS, SpectralEmbedding, TSNE
import numpy as np
from collections import OrderedDict
from time import time
from functools import partial
from matplotlib.ticker import NullFormatter
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def vis_manifold(x, y=None, first_nval=None, n_neighbors=10, n_components=3, n_limit=10000):
    
    '''Quick dimensionality reduction for panel data using sklearn.manifold functionality.
    
    Args:
        x: feature DataFrame
        y: optional Series used as the target
        first_nval: if given, also show 2D visualizations colored by each of the first n columns of x
        n_neighbors: n_neighbors passed to the neighborhood-based manifold methods
        n_components: number of components computed by each method
        n_limit: if x has more than n_limit data points, downsample to n_limit (stratified if y is provided)
    '''
    # parameters
    figsize = (18, 3)
    colname_ls = x.columns
    
    # input validation
    if y is not None:
        if np.ndim(y) != 1:
            raise TypeError('np.ndim(y) != 1')
        if x.shape[0] != len(y):
            raise TypeError('Dimension mismatch: x.shape[0] != len(y)')
        if len(y.unique()) <= n_neighbors:
            n_neighbors = len(y.unique())
            print('parameter update: len(y.unique()) = %d => n_neighbors' % n_neighbors)
            
    # preprocessing: optional downsampling, then standardization
    if x.shape[0] > n_limit:
        if y is not None:
            x_index, _, y, _ = train_test_split(range(x.shape[0]), y, train_size=n_limit / x.shape[0],
                                                stratify=y, random_state=42)
            y = (y - y.mean()) / y.std()  # rescale the subsampled target for coloring
            x = x.iloc[x_index, :].reset_index(drop=True)
            print('parameter update: stratified downsample to %d' % n_limit)
        else:
            x = x.sample(n=n_limit, random_state=1).reset_index(drop=True)
            print('parameter update: downsample to %d' % n_limit)
    scaler = StandardScaler()
    x = scaler.fit_transform(x)  # x is an ndarray from here on
    
    # instantiate the estimators (keyword arguments avoid sklearn's positional-arg deprecation)
    methods = OrderedDict()
    methods['pca'] = PCA(n_components=n_components)
    methods['SE'] = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors)
    methods['Isomap'] = Isomap(n_neighbors=n_neighbors, n_components=n_components)
    methods['t-SNE'] = TSNE(n_components=n_components, init='pca', random_state=0)
    methods['MDS'] = MDS(n_components=n_components, max_iter=100, n_init=1)
    LLE = partial(LocallyLinearEmbedding, n_neighbors=n_neighbors, n_components=n_components,
                  eigen_solver='auto')
    methods['LLE'] = LLE(method='standard')

    # fit each method and record its wall-clock time
    Y = np.empty((len(methods), x.shape[0], n_components), dtype=float)
    t = np.empty(len(methods), dtype=float)
    for i, (label, method) in enumerate(methods.items()):
        t0 = time()
        Y[i] = method.fit_transform(x)
        t[i] = time() - t0
        print("%s: %.2g sec" % (label, t[i]))
        if label == 'pca':
            plt.bar(range(n_components), method.explained_variance_ratio_)
            plt.plot(range(n_components), np.cumsum(method.explained_variance_ratio_), 'o-')
            plt.title("Component-wise and Cumulative Explained Variance")
            
    # 2D scatter of the first two components for every method
    fig = plt.figure(figsize=figsize)
    if y is not None:
        fig.suptitle("Manifold Learning : (color = %s)" % y.name, fontsize=14)
    else:
        fig.suptitle("Manifold Learning", fontsize=14)
    for i, (label, method) in enumerate(methods.items()):
        ax = fig.add_subplot(1, len(methods), 1 + i)
        if y is not None:
            plt.scatter(Y[i][:, 0], Y[i][:, 1], c=np.array(y),
                        marker='.', cmap=plt.cm.Spectral)
        else:
            plt.scatter(Y[i][:, 0], Y[i][:, 1], marker='.')
        plt.xlabel(label)
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
    plt.show()
    # optional extra figures, colored by each of the first n columns of x
    if first_nval is not None:
        for j in range(min(first_nval, len(colname_ls))):
            fig = plt.figure(figsize=figsize)
            fig.suptitle("Manifold Learning : (color = %s)" % colname_ls.values[j], fontsize=14)
            for i, (label, method) in enumerate(methods.items()):
                ax = fig.add_subplot(1, len(methods), 1 + i)
                plt.scatter(Y[i][:, 0], Y[i][:, 1], c=x[:, j],
                            marker='.', cmap=plt.cm.Spectral)
                ax.xaxis.set_major_formatter(NullFormatter())
                ax.yaxis.set_major_formatter(NullFormatter())
            plt.show()
    return methods
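
The target is optional: when y is omitted, the overview figure is drawn uncolored, and first_nval can still be used to color the embeddings by feature values. A minimal sketch of that unsupervised path, assuming a hypothetical feature DataFrame df:

# Sketch: unsupervised usage (df is a hypothetical feature DataFrame).
# No target is passed, so the overview figure is uncolored and the
# first_nval figures are colored by the first three columns of df.
df = pd.DataFrame(np.random.RandomState(0).randn(500, 6),
                  columns=['f%d' % i for i in range(6)])
models = vis_manifold(x=df, first_nval=3)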

Data generation

In [2]:
from sklearn.datasets import make_classification
x, y = make_classification(n_samples=3000, n_features = 10, n_classes = 3,
                           n_informative = 3, random_state=0)
In [3]:
x = pd.DataFrame(x, columns = [ 'col'+str(i) for i in range(x.shape[1]) ])
y = pd.Series(y, name = 'target')
In [4]:
x
Out[4]:
col0 col1 col2 col3 col4 col5 col6 col7 col8 col9
0 -0.246631 1.536473 1.878893 1.511736 -0.627751 -0.477678 1.544578 -1.383914 0.243388 -1.836949
1 0.347440 0.416746 1.177944 0.714926 -1.878313 -1.341999 1.502650 1.269238 0.582506 0.815452
2 0.726722 -0.491154 1.983053 2.067443 -1.382206 0.955374 1.556236 -0.962565 0.103122 -1.361854
3 0.943109 -0.768991 -1.226675 0.179224 0.576035 -0.594412 -0.622367 2.344729 0.163916 2.510684
4 0.558502 -1.396403 -0.356204 0.123369 -0.277405 -0.009907 -1.056255 -1.535977 -0.809389 -1.120022
... ... ... ... ... ... ... ... ... ... ...
2995 0.042244 -0.768635 -0.419798 0.978173 0.110063 0.828541 -0.433958 1.979497 -0.366332 2.250201
2996 -0.966736 0.150888 -0.964605 -0.335861 1.133318 0.664077 -1.059302 1.775004 -0.592055 2.240712
2997 -0.486020 -0.626297 0.361388 0.795056 0.347224 -0.464601 0.029766 -1.917474 -0.099174 -1.960272
2998 -0.639131 0.109291 -0.139325 -0.837591 0.634916 1.940414 -2.107619 -0.199481 -2.532102 1.002135
2999 -1.433774 -0.905081 -0.902083 -0.821010 0.564127 0.022440 -1.606352 -1.578345 -0.957006 -1.001295

3000 rows × 10 columns

In [5]:
y
Out[5]:
0       0
1       0
2       0
3       0
4       2
       ..
2995    2
2996    0
2997    1
2998    1
2999    2
Name: target, Length: 3000, dtype: int64

Output

In [6]:
manifold_models = vis_manifold(x = x, y = y, first_nval = 5)
parameter update: len(y.unique()) = 3 => n_neighbors
pca: 0.0047 sec
SE: 0.39 sec
Isomap: 3.5 sec
t-SNE: 16 sec
MDS: 16 sec
LLE: 0.79 sec
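
The returned dictionary holds the fitted estimators, so the inductive ones can project points afterwards: in scikit-learn, PCA, Isomap, and LocallyLinearEmbedding expose .transform, while t-SNE, MDS, and SpectralEmbedding implement fit_transform only. A minimal sketch follows; note that vis_manifold scales x internally with a StandardScaler it does not return, so the scaling is recreated here (exact in this case, since no downsampling occurred):

# Sketch: reusing the fitted inductive estimators on (scaled) data.
x_scaled = StandardScaler().fit_transform(x)                # mirrors the internal scaling
pca_coords = manifold_models['pca'].transform(x_scaled)     # shape (3000, 3)
iso_coords = manifold_models['Isomap'].transform(x_scaled)  # shape (3000, 3)
lle_coords = manifold_models['LLE'].transform(x_scaled)     # shape (3000, 3)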