import numpy as np

"""
This code was originally written for CS 231n at Stanford University
(cs231n.stanford.edu). It has been modified in various areas for use in the
ECE 239AS class at UCLA. This includes the descriptions of what code to
implement as well as some slight potential changes in variable names to be
consistent with class nomenclature. We thank Justin Johnson & Serena Yeung for
permission to use this code. To see the original version, please visit
cs231n.stanford.edu.
"""

"""
This file implements various first-order update rules that are commonly used
for training neural networks. Each update rule accepts current weights and the
gradient of the loss with respect to those weights and produces the next set
of weights. Each update rule has the same interface:

def update(w, dw, config=None):

Inputs:
  - w: A numpy array giving the current weights.
  - dw: A numpy array of the same shape as w giving the gradient of the
    loss with respect to w.
  - config: A dictionary containing hyperparameter values such as learning
    rate, momentum, etc. If the update rule requires caching values over many
    iterations, then config will also hold these cached values.

Returns:
  - next_w: The next point after the update.
  - config: The config dictionary to be passed to the next iteration of the
    update rule.

NOTE: For most update rules, the default learning rate will probably not
perform well; however the default values of the other hyperparameters should
work well for a variety of different problems.

For efficiency, update rules may perform in-place updates, mutating w and
setting next_w equal to w.
"""

def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.

    config format:
    - learning_rate: Scalar learning rate.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)

    w -= config['learning_rate'] * dw
    return w, config

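
# Minimal usage sketch (added for illustration; not from the original
# assignment). Every rule in this file is called the same way inside a
# training loop: compute the gradient, call the rule, and thread the returned
# config back into the next call so that stateful rules keep their velocity,
# caches, and step counts. Here the "loss" is a toy quadratic 0.5 * ||w||^2,
# whose gradient is simply w.
def _sgd_usage_sketch(num_iterations=100):
    w = np.linspace(-1.0, 1.0, 5)
    config = None
    for _ in range(num_iterations):
        dw = w.copy()                  # gradient of 0.5 * ||w||^2 is w itself
        w, config = sgd(w, dw, config)
    return w                           # entries should have shrunk toward zero
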
def sgd_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with momentum.

    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a
      moving average of the gradients.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)            # default momentum of 0.9
    v = config.get('velocity', np.zeros_like(w))  # stored velocity, else zeros

    # ================================================================ #
    # YOUR CODE HERE:
    #   Implement the momentum update formula. Return the updated weights
    #   as next_w, and the updated velocity as v.
    # ================================================================ #
    a = config['momentum']
    e = config['learning_rate']
    v = a * v - e * dw      # decay the old velocity and step down the gradient
    next_w = w + v
    # ================================================================ #
    # END YOUR CODE HERE
    # ================================================================ #

    config['velocity'] = v
    return next_w, config

def sgd_nesterov_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with Nesterov momentum.

    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a
      moving average of the gradients.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)            # default momentum of 0.9
    v = config.get('velocity', np.zeros_like(w))  # stored velocity, else zeros

    # ================================================================ #
    # YOUR CODE HERE:
    #   Implement the Nesterov momentum update formula. Return the updated
    #   weights as next_w, and the updated velocity as v.
    # ================================================================ #
    a = config['momentum']
    e = config['learning_rate']
    vo = v                  # previous velocity
    v = a * v - e * dw      # new velocity
    # "Lookahead" form of the Nesterov update:
    #   w + v + a * (v - vo)  ==  w - a * vo + (1 + a) * v
    next_w = w + v + a * (v - vo)
    # ================================================================ #
    # END YOUR CODE HERE
    # ================================================================ #

    config['velocity'] = v
    return next_w, config

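
# Sanity-check sketch (added for illustration; not from the original
# assignment). Both momentum variants claim in their docstrings that
# momentum = 0 reduces to plain sgd; this helper verifies that on one step.
def _momentum_reduces_to_sgd_check():
    w = np.linspace(-0.4, 0.6, 10)
    dw = np.linspace(-0.2, 0.4, 10)
    plain, _ = sgd(w.copy(), dw, {'learning_rate': 1e-2})
    mom, _ = sgd_momentum(w.copy(), dw,
                          {'learning_rate': 1e-2, 'momentum': 0.0})
    nest, _ = sgd_nesterov_momentum(w.copy(), dw,
                                    {'learning_rate': 1e-2, 'momentum': 0.0})
    assert np.allclose(plain, mom) and np.allclose(plain, nest)
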
def rmsprop(w, dw, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared
    gradient values to set adaptive per-parameter learning rates.

    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - a: Moving average of the second moments (squares) of the gradients.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('decay_rate', 0.99)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('a', np.zeros_like(w))

    next_w = None
    # ================================================================ #
    # YOUR CODE HERE:
    #   Implement RMSProp. Store the next value of w as next_w. You need
    #   to also store in config['a'] the moving average of the second
    #   moment gradients, so they can be used for future gradients.
    #   Concretely, config['a'] corresponds to "a" in the lecture notes.
    # ================================================================ #
    lr = config['learning_rate']
    b = config['decay_rate']
    ep = config['epsilon']
    a = config['a']
    a = b * a + (1 - b) * (dw ** 2)             # decayed average of squared gradients
    next_w = w - (lr * dw) / (ep + np.sqrt(a))  # per-parameter scaled step
    config['a'] = a
    # ================================================================ #
    # END YOUR CODE HERE
    # ================================================================ #

    return next_w, config

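
# Illustrative helper (not from the original assignment): RMSProp divides each
# coordinate's step by the running RMS of its gradient, so coordinates with
# wildly different gradient magnitudes end up taking steps of comparable size.
def _rmsprop_scaling_sketch(num_steps=5):
    w = np.zeros(2)
    dw = np.array([100.0, 0.01])       # two very different gradient scales
    config = None
    for _ in range(num_steps):
        w, config = rmsprop(w, dw, config)
    return w                           # both entries have moved a similar amount
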
def adam(w, dw, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - v: Moving average of the gradient (first moment).
    - a: Moving average of the squared gradient (second moment).
    - t: Iteration number.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('v', np.zeros_like(w))
    config.setdefault('a', np.zeros_like(w))
    config.setdefault('t', 0)

    next_w = None
    # ================================================================ #
    # YOUR CODE HERE:
    #   Implement Adam. Store the next value of w as next_w. You need
    #   to also store in config['a'] the moving average of the second
    #   moment gradients, and in config['v'] the moving average of the
    #   first moments. Finally, store in config['t'] the increasing time.
    # ================================================================ #
    lr = config['learning_rate']
    b1 = config['beta1']
    b2 = config['beta2']
    ep = config['epsilon']
    v = config['v']
    a = config['a']
    t = config['t'] + 1                         # advance the time step

    v = b1 * v + (1 - b1) * dw                  # first moment estimate
    a = b2 * a + (1 - b2) * (dw ** 2)           # second moment estimate
    v2 = v / (1 - b1 ** t)                      # bias-corrected first moment
    a2 = a / (1 - b2 ** t)                      # bias-corrected second moment
    next_w = w - lr * v2 / (np.sqrt(a2) + ep)

    config['a'] = a
    config['v'] = v
    config['t'] = t     # persist the step count so bias correction advances
    # ================================================================ #
    # END YOUR CODE HERE
    # ================================================================ #

    return next_w, config
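
if __name__ == '__main__':
    # Smoke-test sketch (added for illustration; not from the original
    # assignment): run every update rule on the toy quadratic loss
    # 0.5 * ||w||^2 and confirm that the objective decreases.
    for rule in (sgd, sgd_momentum, sgd_nesterov_momentum, rmsprop, adam):
        w = np.linspace(-2.0, 2.0, 20)
        config = None
        start = 0.5 * np.sum(w ** 2)
        for _ in range(200):
            dw = w.copy()              # gradient of 0.5 * ||w||^2 is w itself
            w, config = rule(w, dw, config)
        end = 0.5 * np.sum(w ** 2)
        print('%-24s loss %.4f -> %.6f' % (rule.__name__, start, end))
        assert end < start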