optimizers.py

import numpy as np

from quantumbrain.graph import graph


class Optimizer:
    """Base class: concrete optimizers override apply_gradients()."""

    def apply_gradients(self):
        pass


class SGD(Optimizer):
    """Vanilla stochastic gradient descent: p <- p - lr * g."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def apply_gradients(self):
        for key in graph.params.keys():
            graph.params[key] -= self.lr * graph.grads[key]


class Momentum(Optimizer):
    """SGD with classical momentum: v <- momentum * v - lr * g; p <- p + v."""

    def __init__(self, lr=0.001, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def apply_gradients(self):
        if self.v is None:
            # Lazily allocate one velocity buffer per parameter.
            self.v = {}
            for key, val in graph.params.items():
                self.v[key] = np.zeros_like(val)
        for key in graph.params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * graph.grads[key]
            graph.params[key] += self.v[key]


class Nesterov(Optimizer):
    """Nesterov accelerated gradient, in the rearranged form that applies
    the momentum look-ahead correction directly to the parameters."""

    def __init__(self, lr=0.001, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def apply_gradients(self):
        if self.v is None:
            # Lazily allocate one velocity buffer per parameter.
            self.v = {}
            for key, val in graph.params.items():
                self.v[key] = np.zeros_like(val)
        for key in graph.params.keys():
            self.v[key] *= self.momentum
            self.v[key] -= self.lr * graph.grads[key]
            graph.params[key] += self.momentum * self.momentum * self.v[key]
            graph.params[key] -= (1 + self.momentum) * self.lr * graph.grads[key]


class AdaGrad(Optimizer):
    """AdaGrad: scales each step by the inverse square root of the
    accumulated squared gradients; delta guards against division by zero."""

    def __init__(self, lr=1e-4):
        self.lr = lr
        self.delta = 1e-7
        self.h = None

    def apply_gradients(self):
        if self.h is None:
            # Lazily allocate one accumulator per parameter.
            self.h = {}
            for key, val in graph.params.items():
                self.h[key] = np.zeros_like(val)
        for key in graph.params.keys():
            self.h[key] += graph.grads[key] * graph.grads[key]
            graph.params[key] -= self.lr * graph.grads[key] / np.sqrt(self.h[key] + self.delta)


class RMSprop(Optimizer):
    """RMSprop: like AdaGrad, but the squared-gradient accumulator decays
    with rate rho, so old gradients are gradually forgotten."""

    def __init__(self, lr=0.001, rho=0.9):
        self.lr = lr
        self.rho = rho
        self.delta = 1e-7
        self.h = None

    def apply_gradients(self):
        if self.h is None:
            # Lazily allocate one accumulator per parameter.
            self.h = {}
            for key, val in graph.params.items():
                self.h[key] = np.zeros_like(val)
        for key in graph.params.keys():
            self.h[key] *= self.rho
            self.h[key] += (1 - self.rho) * graph.grads[key] * graph.grads[key]
            graph.params[key] -= self.lr * graph.grads[key] / np.sqrt(self.h[key] + self.delta)


class Adam(Optimizer):
    """Adam: exponential moving averages of the gradient (m) and its square
    (v), with the bias correction folded into the effective learning rate."""

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999):
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.delta = 1e-7
        self.m = None
        self.v = None
        self.t = 0

    def apply_gradients(self):
        if self.m is None:
            # Lazily allocate moment buffers per parameter.
            self.m = {}
            self.v = {}
            for key, val in graph.params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)
        self.t += 1
        # Bias-corrected step size, equivalent to correcting m and v directly.
        lr = self.lr * np.sqrt(1.0 - self.beta_2**self.t) / (1.0 - self.beta_1**self.t)
        for key in graph.params.keys():
            # Incremental form of m = beta_1*m + (1-beta_1)*g (and likewise for v).
            self.m[key] += (1.0 - self.beta_1) * (graph.grads[key] - self.m[key])
            self.v[key] += (1.0 - self.beta_2) * (graph.grads[key]**2 - self.v[key])
            graph.params[key] -= lr * self.m[key] / (np.sqrt(self.v[key]) + self.delta)
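

# A minimal usage sketch (an assumption, not part of the original module): it
# presumes that graph.params and graph.grads are plain dicts of matching NumPy
# arrays, populated elsewhere by the forward and backward passes. The "W"
# entries below are hypothetical, purely for illustration.
if __name__ == "__main__":
    graph.params["W"] = np.random.randn(3, 3)  # hypothetical parameter
    graph.grads["W"] = np.ones((3, 3))         # hypothetical gradient
    optimizer = Adam(lr=0.001)
    optimizer.apply_gradients()                # one in-place update step
    print(graph.params["W"])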