This is a study note for understanding the hyper-parameters (p1, p2) of the Adam optimizer (adaptive moment estimation).
The weight that the term added at step $k$ carries in the moving average at iteration $i$ is:
$$\text{weight}_{i,k} = p^{i-k-1} (1-p) dv^2$$
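where $i$ is the current iteration, $k$ is the index of an earlier term ($k < i$), $p$ is the moment coefficient (the share of the past $dv$ that contributes to the current estimate), and $dv$ is the gradient term, assumed to be the same at every step: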
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Simulation settings: number of iterations and number of dv terms to display.
i_teration, k_dv = 10, 10
p = 0.5   # moment coefficient: share of the past dv that contributes to the current estimate
dv = 100  # every dv term is assumed to be equivalent

# Row i holds the exponentially weighted moving-average weights of the terms
# k = 0 .. i-1 at iteration i; cells with k >= i keep their initial zero.
arr_m_weight = np.zeros((i_teration + 1, k_dv + 1))
for i in range(i_teration + 1):
    exponents = i - 1 - np.arange(i)
    arr_m_weight[i, :i] = p ** exponents * (1 - p) * dv ** 2

# Drop the all-zero iteration 0 and the unused last column, and label each
# remaining column with the index of the term it tracks.
df_m_weight = pd.DataFrame(
    arr_m_weight[1:, :k_dv],
    columns=[f'nth-moment_{col}' for col in range(k_dv)],
)

# plot: one stacked bar per iteration, one colour per term
f = plt.figure(figsize=(7, 5))
plt.title('Weight of nth-moment over iteration', color='black')
bar_options = dict(
    stacked=True,
    ax=f.gca(),
    xlabel='iteration',
    ylabel='weight in %',
)
df_m_weight.plot.bar(**bar_options)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()
$\text{adjustment} = \frac{\sqrt{1-p_2^{i+1}}}{1-p_1^{i+1}} $
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize
import os
import imageio
# Render one heatmap of the initiation adjustment
#     sqrt(1 - p2**(i+1)) / (1 - p1**(i+1))
# per iteration i over a 20x20 grid of (p1, p2), then stitch the frames
# into an animated GIF and delete the temporary frame files.
frame_dir = './tmp'
gif_path = './img/Adam initiation adjustment.gif'

# Neither output directory is guaranteed to exist; savefig / get_writer
# would fail on a missing directory.
os.makedirs(frame_dir, exist_ok=True)
os.makedirs(os.path.dirname(gif_path), exist_ok=True)

# The (p1, p2) grid does not depend on the iteration, so build it once.
p = np.linspace(0, 1 - 1 / 20, 20)
df_grid = pd.DataFrame({'p1': np.repeat(p, 20), 'p2': np.tile(p, 20)}).round(3)

filenames = []
for i in range(16):
    # plot
    df_p = df_grid.copy()
    df_p['weight'] = np.sqrt(1 - df_p.p2 ** (i + 1)) / (1 - df_p.p1 ** (i + 1))
    # Keyword arguments: pandas >= 2.0 no longer accepts positional
    # index/columns/values in DataFrame.pivot.
    df_p_wide = df_p.pivot(index='p1', columns='p2', values='weight')

    f = plt.figure(figsize=(15, 15))
    title = 'Adam initiation adjustment on m1/sqrt(m2), at iteration = ' + str(i)
    plt.title(title, color='black')
    g = sns.heatmap(df_p_wide,
                    cmap=plt.cm.gist_ncar, vmin=0, vmax=10,
                    cbar_kws={"shrink": .82},
                    annot=True, square=True)
    g.invert_yaxis()  # put the smallest p1 at the bottom of the heatmap

    # create file name and append it to a list
    filename = f'{frame_dir}/{i}.png'
    filenames.append(filename)
    plt.tight_layout()

    # save frame
    plt.savefig(filename)
    plt.close()

# build gif
with imageio.get_writer(gif_path, mode='I') as writer:
    for filename in filenames:
        image = imageio.imread(filename)
        # Append every frame five times so each iteration stays visible longer.
        for _ in range(5):
            writer.append_data(image)

# Remove files
for filename in set(filenames):
    os.remove(filename)
Adjustment on gradient at initiation | Relation between p1 and p2 |
---|---|
Amplify | p1 > p2 |
Diminish | p1 < p2 |
Constant | p1 = p2 |
# Trace the initiation adjustment
#     sqrt(1 - p2**(i+1)) / (1 - p1**(i+1))
# over the first 20 iterations for one fixed (p1, p2) pair.
p1, p2 = 0.5, 0.5
df_adj = pd.DataFrame({'i': np.arange(20),
                       'p1': np.repeat(p1, 20),
                       'p2': np.repeat(p2, 20)})
df_adj['adjustment'] = (np.sqrt(1 - df_adj.p2 ** (df_adj.i + 1))
                        / (1 - df_adj.p1 ** (df_adj.i + 1)))

# The P1 label must show p1 (the original printed p2 for both values).
title = 'Initiation adjustment along iteration, when P1=' + str(p1) + ', p2=' + str(p2)

fig, ax = plt.subplots()
df_adj.plot.scatter(x='i', y='adjustment',
                    title=title,
                    figsize=(15, 5), ax=ax)

# Label every point with its value, nudged slightly up and to the right so
# the text does not sit on top of the marker.
for step, adj in zip(df_adj.i.to_numpy(), df_adj.adjustment.to_numpy()):
    ax.annotate(adj.round(3), (step + 0.01, adj + 0.005))