This is a comparison of the classic gradient descent algorithm under three batch-size settings.

| Evaluation | batch_gradient_descent | stochastic_gradient_descent | mini_batch_gradient_descent |
| --- | --- | --- | --- |
| Samples used per update | all samples | one sample | a random subset of the samples |
| Learning rate | needs a small learning rate to avoid overshooting and diverging | tolerates a larger learning rate, but does not move steadily toward the global minimum because single-sample gradients are noisy | moves stably toward the global minimum with a larger learning rate, since averaging over the batch damps the noise |
| Stability | always moves toward the global minimum along the steepest-descent direction | moves toward the global minimum, but with a lot of noise | moves toward the global minimum with little noise |
| Speed on large / high-dimensional data | slow | fast | moderate |
| Reaches the global minimum (convex loss) | Yes | Usually no (oscillates around it) | Almost |
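All three implementations below minimize the same mean squared error of the linear model $y \approx X\beta$ and share one update rule; they differ only in how many rows of $X$ enter each gradient evaluation:

$$
\mathrm{MSE}(\beta) \;=\; \frac{1}{n}\sum_{i=1}^{n}\bigl(y_i - x_i^{\top}\beta\bigr)^2,
\qquad
\nabla_{\beta}\,\mathrm{MSE}(\beta) \;=\; -\frac{2}{n}\,X^{\top}\bigl(y - X\beta\bigr),
\qquad
\beta \;\leftarrow\; \beta - \mathrm{lr}\cdot\nabla_{\beta}\,\mathrm{MSE}(\beta).
$$

Batch gradient descent evaluates the gradient on all $n$ rows, stochastic gradient descent on a single randomly drawn row, and mini-batch gradient descent on a random subset.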
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib
from matplotlib import ticker, cm
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from numpy import linalg as LA
import time
mu, sigma = 0, 20
size = 500
# design matrix
x_1 = np.random.normal(loc=mu, scale=sigma, size=size)
# x_2 = np.random.normal(loc=mu, scale=sigma, size=size)*x_1
# # x_1 = np.linspace(-20, 20, size)
# X = np.column_stack((x_1,x_2))
X = sm.add_constant(x_1)
# parameter
beta = np.array([5, 5])
# error
e = np.random.normal(loc=mu, scale=sigma, size=size)
# dependent variable
y = np.dot(X, beta) + e
d = {'x_1': x_1, 'y': y, 'e': e}
df = pd.DataFrame(data = d)
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7fc567891390>
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
# https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                 1.132e+04
Date:                Wed, 03 Mar 2021   Prob (F-statistic):               0.00
Time:                        15:32:16   Log-Likelihood:                -2225.9
No. Observations:                 500   AIC:                             4456.
Df Residuals:                     498   BIC:                             4464.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.5058      0.931      5.916      0.000       3.677       7.334
x1             4.9666      0.047    106.379      0.000       4.875       5.058
==============================================================================
Omnibus:                        0.356   Durbin-Watson:                   1.841
Prob(Omnibus):                  0.837   Jarque-Bera (JB):                0.465
Skew:                           0.043   Prob(JB):                        0.793
Kurtosis:                       2.878   Cond. No.                         19.9
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
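Since the loss is an ordinary least-squares problem, the target that all three gradient-descent runs below should converge to can also be computed in closed form. A minimal cross-check sketch using the X and y generated above:

# closed-form least-squares fit; should reproduce the OLS summary (const ~ 5.5058, x1 ~ 4.9666)
beta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)
print(beta_hat)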
num = 15
v = np.linspace(-50, 50, num = num, endpoint = True)
beta_0, beta_1 = np.meshgrid(v,v)
print('length of theta', len(beta_0.flatten()), len(beta_1.flatten()))
length of theta 225 225
MSE_grid = []
for a,b in zip(beta_0.flatten(), beta_1.flatten()):
y_pred = np.dot(X, [a,b])
MSE_grid.append(np.sum(((y_pred - y)**2)) / size)
plt.contourf(beta_0,beta_1, np.reshape(MSE_grid, [num,num]), locator=ticker.LogLocator())
plt.xlabel('beta_0')
plt.ylabel('beta_1')
plt.title('MSE on 2D Parameter space')
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x7fc568f40cd0>
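The loop above is fine for a 15 x 15 grid; for finer grids the same MSE surface can be computed in a single broadcasted expression. A vectorized sketch, equivalent to the loop above:

# stack every (beta_0, beta_1) grid pair into a (225, 2) matrix and evaluate all
# residuals at once: X @ B.T has shape (n_samples, n_grid_points)
B = np.column_stack((beta_0.ravel(), beta_1.ravel()))
MSE_grid_vec = ((X @ B.T - y[:, None]) ** 2).mean(axis=0)
assert np.allclose(MSE_grid, MSE_grid_vec)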
def batch_gradient_descent(X, y, beta, lr=0.005, epoch=8000, early_stop=None):
    '''
    Batch Gradient Descent (metric: MSE)
    X: design matrix
    y: target
    beta: initial parameter
    lr: learning rate, default = 0.005
    epoch: number of parameter updates (each uses all samples), default = 8000
    early_stop: stop once the MSE has increased more than this many times; None (default) disables early stopping
    '''
t0 = time.time()
n = len(y)
# initial log
log_update, log_gradient, log_beta, log_mse, log_time = [], [], [], [], []
log_update.append(0)
log_time.append(0)
es_count = 0
y_pred = np.dot(X, beta)
gradient = -2 * np.dot(X.transpose(), y - y_pred) /n
log_gradient.append(gradient)
log_beta.append(beta)
mse = np.sum(((y_pred - y)**2)) / n
log_mse.append(np.sum(((y_pred - y)**2)) / n)
for i in range(epoch):
y_pred = np.dot(X, beta) # all samples are used
gradient = -2 * np.dot(X.transpose(), y - y_pred) /n
update = lr * gradient
beta = beta - update
# log
mse = np.sum(((y_pred - y)**2)) / n
# mse early stop
if early_stop is not None:
if mse > log_mse[-1]:
es_count += 1
if es_count > early_stop:
break
log_time.append(time.time() - t0)
log_gradient.append(gradient)
log_update.append(update)
log_beta.append(beta)
log_mse.append(mse)
return i, log_gradient, log_update, log_beta, log_mse, log_time
i, log_gradient, log_update, log_beta, log_mse, log_time = batch_gradient_descent(X, y, lr =0.002, beta = [-45, -45])
i, log_gradient[-1], log_update[-1], log_beta[-1], log_mse[-1]
(7999, array([-1.26858879e-12, 8.32187652e-14]), array([-2.53717758e-15, 1.66437530e-16]), array([5.50584248, 4.96656005]), 430.75092932386497)
def stochastic_gradient_descent(X, y, beta, lr=0.005, epoch=8000, early_stop=None):
    '''
    Stochastic Gradient Descent (metric: MSE)
    X: design matrix
    y: target
    beta: initial parameter
    lr: learning rate, default = 0.005
    epoch: number of parameter updates (one random sample per update), default = 8000
    early_stop: stop once the full-data MSE has increased more than this many times; None (default) disables early stopping
    '''
t0 = time.time()
n = len(y)
# initial log
log_update, log_gradient, log_beta, log_mse, log_time = [], [], [], [], []
log_update.append(0)
log_time.append(0)
es_count = 0
y_pred = np.dot(X, beta)
gradient = -2 * np.dot(X.transpose(), y - y_pred) /n
log_gradient.append(gradient)
log_beta.append(beta)
mse = np.sum(((y_pred - y)**2)) / n
log_mse.append(np.sum(((y_pred - y)**2)) / n)
for i in range(epoch):
        update_index = np.random.randint(n)  # randomly pick the index of a single sample
        y_pred = np.dot(X[update_index], beta)
        # single-sample gradient; note it is divided by n (not 1), so the effective per-sample step size is lr / n
        gradient = -2 * np.dot(X[update_index].transpose(), y[update_index] - y_pred) / n
update = lr * gradient
beta = beta - update
# log
y_pred = np.dot(X, beta)
mse = np.sum(((y_pred - y)**2)) / n
# mse early stop
if early_stop is not None:
if mse > log_mse[-1]:
es_count += 1
if es_count > early_stop:
break
log_time.append(time.time() - t0)
log_gradient.append(gradient)
log_update.append(update)
log_beta.append(beta)
log_mse.append(mse)
return i, log_gradient, log_update, log_beta, log_mse, log_time
i, log_gradient, log_update, log_beta, log_mse, log_time = stochastic_gradient_descent(X, y, lr =0.01, beta = [-45, -45])
i, log_gradient[-1], log_update[-1], log_beta[-1], log_mse[-1], log_time[-1]
(7999, array([-0.2689755 , 14.69461793]), array([-0.00268976, 0.14694618]), array([-31.36458028, 4.71979165]), 1800.6214664464098, 0.27544617652893066)
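The early_stop argument is defined but never exercised above. Because the full-data MSE fluctuates under single-sample updates, the stochastic variant is the natural place to try it; an illustrative call (the stopping iteration will vary from run to run):

# hypothetical early-stopped run: stop once the full-data MSE has increased more than 100 times
es_result = stochastic_gradient_descent(X, y, beta=[-45, -45], lr=0.01, early_stop=100)
print('stopped at iteration', es_result[0], 'with MSE', es_result[4][-1])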
def mini_batch_gradient_descent(X, y, beta, batch_size=50, lr=0.005, epoch=8000, early_stop=None):
    '''
    Mini-Batch Gradient Descent (metric: MSE)
    X: design matrix
    y: target
    beta: initial parameter
    batch_size: number of data points used for each update, default = 50
    lr: learning rate, default = 0.005
    epoch: number of parameter updates (one random mini-batch per update), default = 8000
    early_stop: stop once the full-data MSE has increased more than this many times; None (default) disables early stopping
    '''
t0 = time.time()
n = len(y)
# initial log
log_update, log_gradient, log_beta, log_mse, log_time = [], [], [], [], []
log_update.append(0)
log_time.append(0)
es_count = 0
y_pred = np.dot(X, beta)
gradient = -2 * np.dot(X.transpose(), y - y_pred) /n
log_gradient.append(gradient)
log_beta.append(beta)
mse = np.sum(((y_pred - y)**2)) / n
log_mse.append(np.sum(((y_pred - y)**2)) / n)
for i in range(epoch):
        update_index = np.random.randint(n, size=batch_size)  # draw batch_size sample indices (with replacement)
        y_pred = np.dot(X[update_index], beta)
        # mini-batch gradient; dividing by n rather than batch_size shrinks it by a factor of n / batch_size
        gradient = -2 * np.dot(X[update_index].transpose(), y[update_index] - y_pred) / n
update = lr * gradient
beta = beta - update
# log
y_pred = np.dot(X, beta)
mse = np.sum(((y_pred - y)**2)) / n
# mse early stop
if early_stop is not None:
if mse > log_mse[-1]:
es_count += 1
if es_count > early_stop:
break
log_time.append(time.time() - t0)
log_gradient.append(gradient)
log_update.append(update)
log_beta.append(beta)
log_mse.append(mse)
return i, log_gradient, log_update, log_beta, log_mse, log_time
i, log_gradient, log_update, log_beta, log_mse, log_time = mini_batch_gradient_descent(X, y, lr =0.01, beta = [-45, -45])
i, log_gradient[-1], log_update[-1], log_beta[-1], log_mse[-1], log_time[-1]
(7999, array([-0.43111724, -3.63384885]), array([-0.00431117, -0.03633849]), array([5.56704379, 5.02759389]), 432.22938653220774, 0.3653557300567627)
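The stochastic and mini-batch runs draw their sample indices with np.random.randint, so the trajectories below change from run to run. Seeding NumPy's global generator first would make the comparison reproducible (an optional step, not part of the original run):

np.random.seed(0)  # illustrative seed; any fixed value makes the SGD / mini-batch paths repeatable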
data = {'X': X,
'y': y,
'lr': 0.002,
'beta': [-30, -30],
'epoch': 20000}
gradient_descent_algorithms = (
('batch_gradient_descent', batch_gradient_descent),
('stochastic_gradient_descent', stochastic_gradient_descent),
('mini_batch_gradient_descent', mini_batch_gradient_descent)
)
algorithms_result = []
for name, algorithm in gradient_descent_algorithms:
algorithm_result = algorithm(**data)
algorithms_result.append(algorithm_result)
fig, axs = plt.subplots(1, 3, figsize=(15,5), sharey=True)
for i in range(len(algorithms_result)):
log_beta = algorithms_result[i][3]
log_beta_vstack = np.vstack(log_beta)
log_time_total = algorithms_result[i][5][-1]
axs[i].contourf(beta_0,beta_1, np.reshape(MSE_grid, [num,num]), locator=ticker.LogLocator())
axs[i].scatter(x = log_beta_vstack[:,0], y = log_beta_vstack[:,1], color = 'red')
axs[i].plot([5], [5], "*", color = 'yellow')
axs[i].text(.99, .01, ('%.2fs' % (log_time_total)).lstrip('0'),
size=15,
horizontalalignment='right',
verticalalignment='bottom',
transform=axs[i].transAxes)
axs[i].set_title(gradient_descent_algorithms[i][0])
axs[i].set_xlabel('beta_0')
axs[i].set_ylabel('beta_1')
fig, axs = plt.subplots(1, 3, figsize=(15,5), sharey=True, sharex=True)
for i in range(len(algorithms_result)):
log_gradient = algorithms_result[i][1]
    log_gradient_norm = [LA.norm(a) / 100 for a in log_gradient]  # norm of each logged gradient, scaled by 1/100
log_time = algorithms_result[i][5]
axs[i].plot(log_time, log_gradient_norm)
axs[i].set_yscale('log')
axs[i].set_xscale('log')
axs[i].set_title(gradient_descent_algorithms[i][0])
    axs[i].set_ylabel('gradient norm (log scale)')
    axs[i].set_xlabel('time (s)')
fig, axs = plt.subplots(1, 3, figsize=(15,5), sharey=True, sharex=True)
for i in range(len(algorithms_result)):
log_mse = algorithms_result[i][4]
log_time = algorithms_result[i][5]
axs[i].plot(log_time, log_mse)
axs[i].set_yscale('log')
axs[i].set_xscale('log')
axs[i].set_title(gradient_descent_algorithms[i][0])
    axs[i].set_ylabel('MSE loss')
    axs[i].set_xlabel('time (s)')
fig, axs = plt.subplots(1, 3, figsize=(15,5), sharey=True, sharex=True)
for i in range(len(algorithms_result)):
log_update = algorithms_result[i][2]
    log_update_norm = [LA.norm(a) / 100 for a in log_update]  # norm of each logged update, scaled by 1/100
log_time = algorithms_result[i][5]
axs[i].plot(log_time, log_update_norm)
axs[i].set_yscale('log')
axs[i].set_xscale('log')
axs[i].set_title(gradient_descent_algorithms[i][0])
    axs[i].set_ylabel('update norm (log scale)')
    axs[i].set_xlabel('time (s)')