The performance of an ensemble model is determined by the variety among its base learners (a minimal sketch of the three sources follows the table):

Source of variety | Why it works | Example |
---|---|---|
Input features / samples | Increases the input variety seen by each learner | Bootstrap sampling, cross-validation splits
Model heterogeneity | Different models capture different aspects of the decision boundary | Parametric (linear regression, logistic regression, linear SVM, artificial neural networks) and non-parametric (K-nearest neighbors, tree-based models, kernel SVMs such as RBF)
Hyper-parameters | Increases the variety among homogeneous models | e.g. depth of a tree, number of layers in a NN
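A minimal sketch of producing these three kinds of variety by hand (illustrative only, not taken from the scikit-learn guide; the estimator choices and seeds are arbitrary):
# Sketch: three ways to inject variety into an ensemble
from sklearn.datasets import load_digits
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
X, y = load_digits(return_X_y=True)
# 1) Input samples: each learner sees a different bootstrap resample
bootstrap_trees = []
for seed in range(3):
    X_bs, y_bs = resample(X, y, random_state=seed)   # sample rows with replacement
    bootstrap_trees.append(DecisionTreeClassifier(random_state=seed).fit(X_bs, y_bs))
# 2) Model heterogeneity: a parametric and a non-parametric learner
heterogeneous = [LogisticRegression(max_iter=1000).fit(X, y),
                 DecisionTreeClassifier().fit(X, y)]
# 3) Hyper-parameters: the same model family with different depths
depth_variants = [DecisionTreeClassifier(max_depth=d).fit(X, y) for d in (3, 5, None)]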
The main ensemble strategies:

Method | Why it works | How to ensemble | Pros and cons | Representative algorithms |
---|---|---|---|---|
Uniform blending | Law of large numbers | Combine (vote or average with fixed weights) multiple base learners that use different metrics, e.g. different losses in SVMs or DNNs | Pros: model heterogeneity improves generalization. Cons: fixed weight on each learner. | Voting/averaging (commonly used by the methods below during aggregation)
Linear blending | Feature extraction: the base learners' outputs expand the feature space seen by the meta-learner | Aggregate multiple strong heterogeneous learners as feature extractors and train a meta-learner on their outputs | Pros: explicitly feeds higher-level, crossed features to the meta-learner. Cons: the two levels of learners are trained on different data subsets. |
Stacking (any blending) | Improves data usage | Cross-validation: train the base learners on folds and fit the meta-learner on their out-of-fold predictions | |
Boosting | An additive model reduces bias | Aggregate multiple weak homogeneous base learners with low variance and high bias (underfitted) | Pros: each sample is weighted differently, which helps separate hard labels. Cons: models are trained sequentially; parallelism is only possible with extra tricks. | AdaBoost, GBDT, XGBoost, LightGBM
Bagging | Bootstrap sampling reduces variance; random data subsets create multiple starting points and avoid getting stuck at a local maximum/minimum | Aggregate multiple strong homogeneous base learners with high variance and low bias (overfitted) | Pros: independent learners allow parallel training. Cons: easy and hard samples get the same weight. | Random forest
Reference: Scikit-Learn Ensemble methods documentation
# Load the digits dataset and hold out 20% of the samples as a test set
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
X, y = digits.data, digits.target
feature_name = digits.feature_names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('X_train:', X_train.shape, '\n'
      'y_train:', y_train.shape, '\n'
      'X_test:', X_test.shape, '\n'
      'y_test:', y_test.shape)
# for testing a base classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
model = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
print('f1_score:',f1_score(y_test, model.predict(X_test), average='micro'))
# for testing a base regressor (regressing directly on the digit labels)
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
model = DecisionTreeRegressor(max_depth=5).fit(X_train, y_train)
print('r2_score:', r2_score(y_test, model.predict(X_test)))
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = LogisticRegression()
model = VotingClassifier(estimators=[('dt', clf1), ('lr', clf2)], weights=[1,1]).fit(X_train, y_train)
print('f1_score:',f1_score(y_test, model.predict(X_test), average='micro'))
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
reg1 = LinearRegression()
reg2 = RandomForestRegressor(n_estimators=10, random_state=1)
model = VotingRegressor(estimators=[('lr', reg1), ('rf', reg2)],weights=[1,1]).fit(X_train, y_train)
print('r2_score:',r2_score(y_test, model.predict(X_test)))
# Linear blending (classifier): train the base learners on one half of the training
# data, then train a meta-learner on their predicted class probabilities for the other half
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
clfs = [RandomForestClassifier(),
        LogisticRegression(),
        LGBMClassifier()]
X_d1, X_d2, y_d1, y_d2 = train_test_split(X_train, y_train, test_size=0.5, random_state=2020)
n_classes = len(np.unique(y_train))
dataset_d2 = np.zeros((X_d2.shape[0], len(clfs) * n_classes))
dataset_test = np.zeros((X_test.shape[0], len(clfs) * n_classes))
for j, clf in enumerate(clfs):
    clf.fit(X_d1, y_d1)
    # use the full probability vector (digits has 10 classes) as meta-features
    dataset_d2[:, j * n_classes:(j + 1) * n_classes] = clf.predict_proba(X_d2)
    dataset_test[:, j * n_classes:(j + 1) * n_classes] = clf.predict_proba(X_test)
    print('f1_score:', f1_score(y_test, clf.predict(X_test), average='micro'), ' ', clf)
model = GradientBoostingClassifier().fit(dataset_d2, y_d2)
print('f1_score:', f1_score(y_test, model.predict(dataset_test), average='micro'), ' Linear blending using', model)
# Linear blending (regressor): train the base learners on one half of the training
# data, then train a meta-learner on their predictions for the other half
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
regs = [RandomForestRegressor(),
        LinearRegression(),
        LGBMRegressor()]
X_d1, X_d2, y_d1, y_d2 = train_test_split(X_train, y_train, test_size=0.5, random_state=2020)
dataset_d2 = np.zeros((X_d2.shape[0], len(regs)))
dataset_test = np.zeros((X_test.shape[0], len(regs)))
for j, reg in enumerate(regs):
    reg.fit(X_d1, y_d1)
    dataset_d2[:, j] = reg.predict(X_d2)
    dataset_test[:, j] = reg.predict(X_test)
    print('r2_score:', r2_score(y_test, reg.predict(X_test)), ' ', reg)
model = GradientBoostingRegressor().fit(dataset_d2, y_d2)
print('r2_score:', r2_score(y_test, model.predict(dataset_test)), ' Linear blending using', model)
# Stacking (classifier): cross-validated out-of-fold predictions of the base learners
# form the training set of the meta-learner
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from mlxtend.classifier import StackingCVClassifier
clfs = [RandomForestClassifier(),
        LogisticRegression(),
        LGBMClassifier()]
for clf in clfs:
    clf.fit(X_train, y_train)
    print('f1_score:', f1_score(y_test, clf.predict(X_test), average='micro'), clf)
model = StackingCVClassifier(classifiers=clfs, meta_classifier=GradientBoostingClassifier(), cv=5).fit(X_train, y_train)
print('f1_score:', f1_score(y_test, model.predict(X_test), average='micro'), ' stacking using', model)
# Stacking (regressor): cross-validated out-of-fold predictions of the base learners
# form the training set of the meta-learner
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from mlxtend.regressor import StackingCVRegressor
regs = [RandomForestRegressor(),
        LinearRegression(),
        LGBMRegressor()]
for reg in regs:
    reg.fit(X_train, y_train)
    print('r2_score:', r2_score(y_test, reg.predict(X_test)), reg)
model = StackingCVRegressor(regressors=regs, meta_regressor=GradientBoostingRegressor(), cv=5).fit(X_train, y_train)
print('r2_score:', r2_score(y_test, model.predict(X_test)), ' stacking using', model)
# Bagging (classifier): each tree is trained on a random 30% sample of the training data
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
model1=DecisionTreeClassifier(max_depth=5)
model=BaggingClassifier(model1,n_estimators=100,max_samples=0.3).fit(X_train,y_train)
print('f1_score:',f1_score(y_test, model.predict(X_test), average='micro'))
# Bagging (regressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
baseModel= DecisionTreeRegressor(max_depth=5)
model = BaggingRegressor(baseModel, n_estimators=10, random_state=0).fit(X_train,y_train)
print('r2_score:',r2_score(y_test, model.predict(X_test)))
# Random forest (classifier)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier().fit(X_train, y_train)
print('f1_score:',f1_score(y_test, model.predict(X_test), average='micro'))
# Random forest (regressor)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor().fit(X_train, y_train)
print('r2_score:',r2_score(y_test, model.predict(X_test)))
The boosting algorithms in more detail:

Algorithm | Method | Highlight | Weakness |
---|---|---|---|
AdaBoost (Adaptive Boosting) | 1) Initialize a weak estimator; 2) give higher weight to the misclassified samples; 3) iterate until a stopping condition is satisfied. | 1) Focuses on the difficult samples by giving them more weight during training; 2) ensembles all estimators by weighted voting according to their performance. | 1) Sequential models do not support parallel training.
BDT (boosted decision tree) | 1) Initialize a weak estimator; 2) train the next weak estimator on the residual of the ensemble of the previous weak estimators; 3) iterate until a stopping condition is satisfied. | 1) The loss is a function of the residual from the previous ensemble. | 1) Sequential models do not support parallel training.
GBDT (gradient boosting decision tree) | Similar to BDT. 1) Train each weak estimator on the negative gradient of the loss of the previous ensemble (for squared loss this is the residual). | 1) Fitting the negative gradient of the loss helps the ensemble converge faster and works for any differentiable loss. | 1) Sequential models do not support parallel training.
XGBoost | Similar to GBDT. 1) Simplifies the optimization by approximating the objective with a second-order Taylor expansion; 2) controls model complexity by adding a regularization term to the objective; 3) adds a learning-rate hyperparameter (Shrinkage) to avoid overfitting. | High accuracy from precise split points. 1) Computes the gain of every candidate split of every feature with the exact greedy (pre-sorted) algorithm in $O(depth \times features \times n \log n)$, where the $n \log n$ term comes from sorting the $n$ samples; 2) when the data are too large to sort at every split, discretizes each feature with a weighted quantile sketch (weighted by the second derivative) and runs an approximate split-finding algorithm; storing the data in sparse CSC blocks means sorting is done once instead of at every split, reducing the cost from $O(depth \times features \times n \log n)$ to $O(depth \times features \times n + n \log n)$; 3) enables parallel reading of blocks from different disks; 4) when the data do not fit in memory, out-of-core computation overlaps disk reads with computation via block compression and block sharding; 5) handles missing values with sparsity-aware split finding. | 1) To obtain precise split points, the sorting required by the exact greedy algorithm (even when optimized with the approximate algorithm) is computationally and memory expensive, since each feature must be scanned in sorted order for the best split; 2) the cache hit rate is low because gradients are accessed non-contiguously through the block structure.
LightGBM | Similar to XGBoost. 1) Grows trees leaf-wise (with a max-depth limit) instead of level-wise as in XGBoost; 2) replaces the pre-sorted algorithm, $O(samples \times features)$ in XGBoost, with a histogram algorithm that lowers the cost of computing split gains to $O(bins \times features)$: the data are binned only once for the whole dataset instead of sorted once per weak estimator, and the histogram of a sibling branch is obtained by subtracting a branch's histogram from its parent's. | Fast and memory-efficient with only a small reduction in accuracy. 1) The histogram algorithm stores gradient information contiguously and cuts memory roughly 8x ($features \times samples \times 1$ byte vs. $2 \times features \times samples \times 4$ bytes); 2) Gradient-based One-Side Sampling (GOSS) keeps the samples with large gradients and randomly subsamples the well-trained samples (low loss, hence small gradients); 3) Exclusive Feature Bundling (EFB) merges mutually exclusive features, reducing the complexity from $O(samples \times features)$ to $O(samples \times bundles)$, where the Greedy Bundling algorithm decides what to bundle and the Merge Exclusive Features algorithm decides how to merge; 4) cache-line-aware optimization and PV-Tree parallelism (feature-parallel, data-parallel and voting-parallel) give about a 40% speed-up on the Higgs data; 5) native categorical-feature support gives about a 4x speed-up on the Expo data. | 1) The split points are approximate, which causes a small (usually negligible) reduction in accuracy.
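To make the residual-fitting idea in the BDT/GBDT rows concrete, here is a minimal from-scratch sketch (illustrative only, assuming squared loss; simple_gbdt_fit and simple_gbdt_predict are hypothetical helpers, not library code). It reuses the digits split and r2_score from above; each round fits a small tree to the current residual, which equals the negative gradient for squared loss, scaled by a shrinkage factor.
# Sketch: gradient boosting by hand with squared loss -- each tree fits the
# residual (== negative gradient of 1/2*(y - F)^2), scaled by a learning rate
import numpy as np
from sklearn.tree import DecisionTreeRegressor
def simple_gbdt_fit(X, y, n_rounds=50, lr=0.1, max_depth=3):
    init = y.mean()
    F = np.full(len(y), init)               # initial constant prediction
    trees = []
    for _ in range(n_rounds):
        residual = y - F                    # negative gradient of squared loss
        tree = DecisionTreeRegressor(max_depth=max_depth).fit(X, residual)
        F += lr * tree.predict(X)           # shrinkage keeps each step small
        trees.append(tree)
    return init, trees
def simple_gbdt_predict(init, trees, X, lr=0.1):
    return init + lr * sum(tree.predict(X) for tree in trees)
init, trees = simple_gbdt_fit(X_train, y_train.astype(float))
print('r2_score:', r2_score(y_test, simple_gbdt_predict(init, trees, X_test)))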
# AdaBoost (classifier): reweight misclassified samples at each round
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
weakClassifier = DecisionTreeClassifier(max_depth=1)  # decision stump as the weak learner
model = AdaBoostClassifier(weakClassifier).fit(X_train, y_train)  # fit on the training split only
print('f1_score:', f1_score(y_test, model.predict(X_test), average='micro'))
# AdaBoost (regressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
weakRegressor = DecisionTreeRegressor(max_depth=3)  # shallow tree as the weak learner
model = AdaBoostRegressor(weakRegressor).fit(X_train, y_train)  # fit on the training split only
print('r2_score:', r2_score(y_test, model.predict(X_test)))
# XGBoost (classifier)
from xgboost import XGBClassifier
model = XGBClassifier().fit(X_train, y_train,
                            eval_set=[(X_train, y_train), (X_test, y_test)],
                            eval_metric='mlogloss', verbose=False)
print('f1_score:', f1_score(y_test, model.predict(X_test), average='micro'))
# XGBoost (regressor)
from xgboost import XGBRegressor
model = XGBRegressor().fit(X_train, y_train,
                           eval_set=[(X_train, y_train), (X_test, y_test)],
                           eval_metric='rmse', verbose=False)
print('r2_score:', r2_score(y_test, model.predict(X_test)))
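As a hedged sketch (arbitrary, untuned values, assuming the xgboost scikit-learn wrapper), the regularization term and the Shrinkage step from the XGBoost row above show up directly as estimator parameters:
# Mapping table concepts to XGBoost parameters (example values, not tuned):
# learning_rate is the Shrinkage step, reg_lambda/reg_alpha are the L2/L1
# regularization terms in the objective, max_depth bounds per-tree complexity.
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=200,   # number of boosting rounds
                      learning_rate=0.1,  # Shrinkage on each tree's contribution
                      max_depth=4,        # per-tree complexity limit
                      reg_lambda=1.0,     # L2 regularization
                      reg_alpha=0.0       # L1 regularization
                      ).fit(X_train, y_train)
print('f1_score:', f1_score(y_test, model.predict(X_test), average='micro'))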
# LightGBM (classifier)
from lightgbm import LGBMClassifier
model = LGBMClassifier().fit(X_train, y_train,
                             eval_set=[(X_train, y_train), (X_test, y_test)],
                             eval_metric='logloss', verbose=False)
print('f1_score:', f1_score(y_test, model.predict(X_test), average='micro'))
# LightGBM (regressor)
from lightgbm import LGBMRegressor
model = LGBMRegressor().fit(X_train, y_train,
                            eval_set=[(X_train, y_train), (X_test, y_test)],
                            eval_metric='rmse', verbose=False)  # 'logloss' is a classification metric
print('r2_score:', r2_score(y_test, model.predict(X_test)))
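As a hedged sketch (arbitrary, untuned values), the histogram binning and leaf-wise growth from the LightGBM row map onto estimator parameters; the GOSS and EFB switches vary by LightGBM version (for example, boosting_type='goss' in older releases vs. data_sample_strategy='goss' in 4.x), so they are only mentioned in comments here:
# Mapping table concepts to LightGBM parameters (example values, not tuned):
# max_bin sets the histogram resolution, num_leaves/max_depth control the
# leaf-wise tree growth; GOSS and EFB are separate, version-dependent switches
# and are left at their defaults in this sketch.
from lightgbm import LGBMClassifier
model = LGBMClassifier(n_estimators=200,
                       learning_rate=0.1,
                       max_bin=255,      # histogram bins per feature
                       num_leaves=31,    # cap on leaves for leaf-wise growth
                       max_depth=-1      # -1 = no explicit depth limit
                       ).fit(X_train, y_train)
print('f1_score:', f1_score(y_test, model.predict(X_test), average='micro'))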