Compatibility with Keras 2.2.3 and Tensorflow 1.11.0 #13

Open · wants to merge 2 commits into master
3 files renamed without changes.
6 changes: 6 additions & 0 deletions keras_2/README.md
@@ -0,0 +1,6 @@

# Weight Normalization using Keras

Example code for using Weight Normalization with [Keras](https://keras.io).

```cifar10_cnn.py``` contains the standard CIFAR-10 example from Keras, with the optimizer setup (lines 66-68) and a data-dependent initialization call (lines 81-82) edited in to include weight normalization.
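
A minimal sketch of how these pieces fit together is shown below. The toy data, model, and hyperparameters are purely illustrative, and it assumes ```weightnorm.py``` (providing ```SGDWithWeightnorm``` and ```data_based_init```) is importable from the working directory:

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

from weightnorm import SGDWithWeightnorm, data_based_init

# toy data, just to show the wiring
x_train = np.random.rand(256, 20).astype('float32')
y_train = np.random.randint(0, 2, size=(256, 1)).astype('float32')

model = Sequential()
model.add(Dense(64, input_shape=(20,)))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# use the weight-normalized optimizer instead of a stock Keras optimizer
opt = SGDWithWeightnorm(lr=0.01, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# data-dependent initialization on a small batch, then train as usual
data_based_init(model, x_train[:100])
model.fit(x_train, y_train, batch_size=32, epochs=2)
```

Swapping in ```AdamWithWeightnorm``` works the same way; only the optimizer line changes.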
145 changes: 145 additions & 0 deletions keras_2/cifar10_cnn.py
@@ -0,0 +1,145 @@
'''
CIFAR-10 example from https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py
Now with weight normalization. Lines 66-68 and 81-82 contain the changes w.r.t. the original.
'''

from __future__ import print_function
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os

'''Train a simple deep CNN on the CIFAR10 small images dataset.

It gets to 75% validation accuracy in 25 epochs, and 79% after 50 epochs.
(It's still underfitting at that point, though.)

With weight normalization, a validation accuracy of 75% is already reached
after 10 epochs.
'''

batch_size = 32
num_classes = 10
epochs = 100
data_augmentation = True
num_predictions = 20
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'keras_cifar10_trained_model.h5'

# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# let's train the model using SGD + momentum (how original). EDIT: now with weight normalization, so slightly more original ;-)
from weightnorm import SGDWithWeightnorm, AdamWithWeightnorm
opt_wn = SGDWithWeightnorm(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
#opt_wn = AdamWithWeightnorm(lr=0.001, decay=1e-6)

# Compile the model with the weight-normalized optimizer
model.compile(loss='categorical_crossentropy',
              optimizer=opt_wn,
              metrics=['accuracy'])

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# data based initialization of parameters
from weightnorm import data_based_init
data_based_init(model, x_train[:100])
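# (this passes the first 100 training images through the network once and rescales
#  each layer's kernel and bias so its outputs start out roughly normalized)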

if not data_augmentation:
    print('Not using data augmentation.')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              shuffle=True)
else:
    print('Using real-time data augmentation.')
    # This will do preprocessing and realtime data augmentation:
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        zca_epsilon=1e-06,  # epsilon for ZCA whitening
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        # randomly shift images horizontally (fraction of total width)
        width_shift_range=0.1,
        # randomly shift images vertically (fraction of total height)
        height_shift_range=0.1,
        shear_range=0.,  # set range for random shear
        zoom_range=0.,  # set range for random zoom
        channel_shift_range=0.,  # set range for random channel shifts
        # set mode for filling points outside the input boundaries
        fill_mode='nearest',
        cval=0.,  # value used for fill_mode = "constant"
        horizontal_flip=True,  # randomly flip images
        vertical_flip=False,  # randomly flip images
        # set rescaling factor (applied before any other transformation)
        rescale=None,
        # set function that will be applied on each input
        preprocessing_function=None,
        # image data format, either "channels_first" or "channels_last"
        data_format=None,
        # fraction of images reserved for validation (strictly between 0 and 1)
        validation_split=0.0)

    # Compute quantities required for feature-wise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    datagen.fit(x_train)

    # Fit the model on the batches generated by datagen.flow().
    model.fit_generator(datagen.flow(x_train, y_train,
                                     batch_size=batch_size),
                        epochs=epochs,
                        steps_per_epoch=len(x_train) // batch_size,
                        validation_data=(x_test, y_test),
                        workers=4)

# Save model and weights
if not os.path.isdir(save_dir):
os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Score trained model.
scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
208 changes: 208 additions & 0 deletions keras_2/weightnorm.py
@@ -0,0 +1,208 @@
from keras import backend as K
from keras.optimizers import SGD, Adam
import tensorflow as tf

# adapted from keras.optimizers.SGD
class SGDWithWeightnorm(SGD):
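    """SGD with momentum where every parameter tensor with more than one axis is
    updated in the weight-normalized (V, g) parameterization rather than directly."""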
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
            self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):

            # if a weight tensor (len > 1) use weight normalized parameterization
            ps = K.get_variable_shape(p)
            if len(ps) > 1:

                # get weight normalization parameters
                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)

                # momentum container for the 'g' parameter
                V_scaler_shape = K.get_variable_shape(V_scaler)
                m_g = K.zeros(V_scaler_shape)

                # update g parameters
                v_g = self.momentum * m_g - lr * grad_g  # velocity
                self.updates.append(K.update(m_g, v_g))
                if self.nesterov:
                    new_g_param = g_param + self.momentum * v_g - lr * grad_g
                else:
                    new_g_param = g_param + v_g

                # update V parameters
                v_v = self.momentum * m - lr * grad_V  # velocity
                self.updates.append(K.update(m, v_v))
                if self.nesterov:
                    new_V_param = V + self.momentum * v_v - lr * grad_V
                else:
                    new_V_param = V + v_v

                # if there are constraints we apply them to V, not W
                if getattr(p, 'constraint', None) is not None:
                    new_V_param = p.constraint(new_V_param)

                # wn param updates --> W updates
                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)

            else:  # normal SGD with momentum
                v = self.momentum * m - lr * g  # velocity
                self.updates.append(K.update(m, v))

                if self.nesterov:
                    new_p = p + self.momentum * v - lr * g
                else:
                    new_p = p + v

                # apply constraints
                if getattr(p, 'constraint', None) is not None:
                    new_p = p.constraint(new_p)

                self.updates.append(K.update(p, new_p))
        return self.updates

# adapted from keras.optimizers.Adam
class AdamWithWeightnorm(Adam):
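    """Adam where every parameter tensor with more than one axis is updated in the
    weight-normalized (V, g) parameterization rather than directly."""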
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))

        t = K.cast(self.iterations + 1, K.floatx())
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):

            # if a weight tensor (len > 1) use weight normalized parameterization
            # this is the only part changed w.r.t. keras.optimizers.Adam
            ps = K.get_variable_shape(p)
            if len(ps) > 1:

                # get weight normalization parameters
                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)

                # Adam containers for the 'g' parameter
                V_scaler_shape = K.get_variable_shape(V_scaler)
                m_g = K.zeros(V_scaler_shape)
                v_g = K.zeros(V_scaler_shape)

                # update g parameters
                m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
                v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
                new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
                self.updates.append(K.update(m_g, m_g_t))
                self.updates.append(K.update(v_g, v_g_t))

                # update V parameters
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
                new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                # if there are constraints we apply them to V, not W
                if getattr(p, 'constraint', None) is not None:
                    new_V_param = p.constraint(new_V_param)

                # wn param updates --> W updates
                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)

            else:  # do optimization normally
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                new_p = p_t
                # apply constraints
                if getattr(p, 'constraint', None) is not None:
                    new_p = p.constraint(new_p)
                self.updates.append(K.update(p, new_p))
        return self.updates


def get_weightnorm_params_and_grads(p, g):
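    """
    Weight normalization reparameterizes a weight tensor W as W = g * V / ||V||,
    where ||V|| is taken over all axes except the last (output) axis.

    Only W (the Keras variable p) and V_scaler = g / ||V|| are stored explicitly, so
    V = p / V_scaler and g_param = V_scaler * ||V||. Given the gradient of the loss
    w.r.t. W (the argument g), the chain rule gives
        grad_g = sum(g * V, norm_axes) / ||V||
        grad_V = V_scaler * (g - grad_g / ||V|| * V)
    which is what is computed below.
    """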
    ps = K.get_variable_shape(p)

    # construct weight scaler: V_scaler = g/||V||
    V_scaler_shape = (ps[-1],)  # assumes we're using tensorflow!
    V_scaler = K.ones(V_scaler_shape)  # init to ones, so effective parameters don't change

    # get V parameters = ||V||/g * W
    norm_axes = [i for i in range(len(ps) - 1)]
    V = p / tf.reshape(V_scaler, [1] * len(norm_axes) + [-1])

    # split V_scaler into ||V|| and g parameters
    V_norm = tf.sqrt(tf.reduce_sum(tf.square(V), norm_axes))
    g_param = V_scaler * V_norm

    # get grad in V,g parameters
    grad_g = tf.reduce_sum(g * V, norm_axes) / V_norm
    grad_V = tf.reshape(V_scaler, [1] * len(norm_axes) + [-1]) * \
        (g - tf.reshape(grad_g / V_norm, [1] * len(norm_axes) + [-1]) * V)

    return V, V_norm, V_scaler, g_param, grad_g, grad_V


def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
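    """Recompute V_scaler = g / ||V|| and the effective weight W = V_scaler * V from
    the updated (V, g) values, and append the corresponding variable updates."""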
    ps = K.get_variable_shape(new_V_param)
    norm_axes = [i for i in range(len(ps) - 1)]

    # update W and V_scaler
    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
    new_V_scaler = new_g_param / new_V_norm
    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
    updates.append(K.update(W, new_W))
    updates.append(K.update(V_scaler, new_V_scaler))


# data based initialization for a given Keras model
def data_based_init(model, input):
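    """Run `input` (an initialization batch) through the model once and rescale every
    layer that has exactly a kernel and a bias so that its output has roughly zero
    mean and unit variance on that batch."""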

    # input can be dict, numpy array, or list of numpy arrays
    if type(input) is dict:
        feed_dict = input
    elif type(input) is list:
        feed_dict = {tf_inp: np_inp for tf_inp, np_inp in zip(model.inputs, input)}
    else:
        feed_dict = {model.inputs[0]: input}

    # add learning phase if required
    if model.uses_learning_phase and K.learning_phase() not in feed_dict:
        feed_dict.update({K.learning_phase(): 1})

    # get all layer name, output, weight, bias tuples
    layer_output_weight_bias = []
    for l in model.layers:
        trainable_weights = l.trainable_weights
        if len(trainable_weights) == 2:
            W, b = trainable_weights
            assert(l.built)
            layer_output_weight_bias.append((l.name, l.get_output_at(0), W, b))  # if more than one node, only use the first

    # iterate over our list and do data dependent init
    sess = K.get_session()
    for l, o, W, b in layer_output_weight_bias:
        print('Performing data dependent initialization for layer ' + l)
        m, v = tf.nn.moments(o, [i for i in range(len(o.get_shape()) - 1)])
        s = tf.sqrt(v + 1e-10)
        updates = tf.group(W.assign(W / tf.reshape(s, [1] * (len(W.get_shape()) - 1) + [-1])),
                           b.assign((b - m) / s))
        sess.run(updates, feed_dict)