diff --git a/keras/README.md b/keras_1/README.md
similarity index 100%
rename from keras/README.md
rename to keras_1/README.md
diff --git a/keras/cifar10_cnn.py b/keras_1/cifar10_cnn.py
similarity index 100%
rename from keras/cifar10_cnn.py
rename to keras_1/cifar10_cnn.py
diff --git a/keras/weightnorm.py b/keras_1/weightnorm.py
similarity index 100%
rename from keras/weightnorm.py
rename to keras_1/weightnorm.py
diff --git a/keras_2/README.md b/keras_2/README.md
new file mode 100644
index 0000000..b59eeae
--- /dev/null
+++ b/keras_2/README.md
@@ -0,0 +1,6 @@
+
+# Weight Normalization using Keras
+
+Example code for using Weight Normalization with [Keras](https://keras.io).
+
+```cifar10_cnn.py``` contains the standard CIFAR-10 example from Keras, with lines 66-68 and 80-82 edited to include weight normalization and data dependent initialization.
\ No newline at end of file
diff --git a/keras_2/cifar10_cnn.py b/keras_2/cifar10_cnn.py
new file mode 100644
index 0000000..8dd51bc
--- /dev/null
+++ b/keras_2/cifar10_cnn.py
@@ -0,0 +1,145 @@
+'''
+CIFAR-10 example from https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py
+Now with weight normalization. Lines 66-68 (weight normalized optimizer) and 80-82 (data dependent initialization) contain the changes w.r.t. the original.
+'''
+
+from __future__ import print_function
+import keras
+from keras.datasets import cifar10
+from keras.preprocessing.image import ImageDataGenerator
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+import os
+
+'''Train a simple deep CNN on the CIFAR10 small images dataset.
+
+It gets to 75% validation accuracy in 25 epochs, and 79% after 50 epochs.
+(it's still underfitting at that point, though).
+
+With weight normalization, a validation accuracy of 75% is already reached
+after 10 epochs.
+'''
+
+batch_size = 32
+num_classes = 10
+epochs = 100
+data_augmentation = True
+num_predictions = 20
+save_dir = os.path.join(os.getcwd(), 'saved_models')
+model_name = 'keras_cifar10_trained_model.h5'
+
+# The data, split between train and test sets:
+(x_train, y_train), (x_test, y_test) = cifar10.load_data()
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# Convert class vectors to binary class matrices.
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, (3, 3), padding='same',
+                 input_shape=x_train.shape[1:]))
+model.add(Activation('relu'))
+model.add(Conv2D(32, (3, 3)))
+model.add(Activation('relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+
+model.add(Conv2D(64, (3, 3), padding='same'))
+model.add(Activation('relu'))
+model.add(Conv2D(64, (3, 3)))
+model.add(Activation('relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+
+model.add(Flatten())
+model.add(Dense(512))
+model.add(Activation('relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes))
+model.add(Activation('softmax'))
+
+# let's train the model using SGD + momentum (how original). EDIT: now with weight normalization, so slightly more original ;-)
+from weightnorm import SGDWithWeightnorm, AdamWithWeightnorm
+opt_wn = SGDWithWeightnorm(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
+#opt_wn = AdamWithWeightnorm(lr=0.001, decay=1e-6)
+
+# Let's train the model using the weight normalized optimizer
+model.compile(loss='categorical_crossentropy',
+              optimizer=opt_wn,
+              metrics=['accuracy'])
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+
+# data based initialization of parameters
+from weightnorm import data_based_init
+data_based_init(model, x_train[:100])
+
+if not data_augmentation:
+    print('Not using data augmentation.')
+    model.fit(x_train, y_train,
+              batch_size=batch_size,
+              epochs=epochs,
+              validation_data=(x_test, y_test),
+              shuffle=True)
+else:
+    print('Using real-time data augmentation.')
+    # This will do preprocessing and realtime data augmentation:
+    datagen = ImageDataGenerator(
+        featurewise_center=False,  # set input mean to 0 over the dataset
+        samplewise_center=False,  # set each sample mean to 0
+        featurewise_std_normalization=False,  # divide inputs by std of the dataset
+        samplewise_std_normalization=False,  # divide each input by its std
+        zca_whitening=False,  # apply ZCA whitening
+        zca_epsilon=1e-06,  # epsilon for ZCA whitening
+        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
+        # randomly shift images horizontally (fraction of total width)
+        width_shift_range=0.1,
+        # randomly shift images vertically (fraction of total height)
+        height_shift_range=0.1,
+        shear_range=0.,  # set range for random shear
+        zoom_range=0.,  # set range for random zoom
+        channel_shift_range=0.,  # set range for random channel shifts
+        # set mode for filling points outside the input boundaries
+        fill_mode='nearest',
+        cval=0.,  # value used for fill_mode = "constant"
+        horizontal_flip=True,  # randomly flip images
+        vertical_flip=False,  # randomly flip images
+        # set rescaling factor (applied before any other transformation)
+        rescale=None,
+        # set function that will be applied on each input
+        preprocessing_function=None,
+        # image data format, either "channels_first" or "channels_last"
+        data_format=None,
+        # fraction of images reserved for validation (strictly between 0 and 1)
+        validation_split=0.0)
+
+    # Compute quantities required for feature-wise normalization
+    # (std, mean, and principal components if ZCA whitening is applied).
+    datagen.fit(x_train)
+
+    # Fit the model on the batches generated by datagen.flow().
+    model.fit_generator(datagen.flow(x_train, y_train,
+                                     batch_size=batch_size),
+                        epochs=epochs,
+                        steps_per_epoch=len(x_train) // batch_size,
+                        validation_data=(x_test, y_test),
+                        workers=4)
+
+# Save model and weights
+if not os.path.isdir(save_dir):
+    os.makedirs(save_dir)
+model_path = os.path.join(save_dir, model_name)
+model.save(model_path)
+print('Saved trained model at %s ' % model_path)
+
+# Score trained model.
+scores = model.evaluate(x_test, y_test, verbose=1)
+print('Test loss:', scores[0])
+print('Test accuracy:', scores[1])
\ No newline at end of file
diff --git a/keras_2/weightnorm.py b/keras_2/weightnorm.py
new file mode 100644
index 0000000..dec50b1
--- /dev/null
+++ b/keras_2/weightnorm.py
@@ -0,0 +1,208 @@
+from keras import backend as K
+from keras.optimizers import SGD, Adam
+import tensorflow as tf
+
+# adapted from keras.optimizers.SGD
+class SGDWithWeightnorm(SGD):
+    def get_updates(self, loss, params):
+        grads = self.get_gradients(loss, params)
+        self.updates = []
+
+        lr = self.lr
+        if self.initial_decay > 0:
+            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
+        self.updates.append(K.update_add(self.iterations, 1))
+
+        # momentum
+        shapes = [K.get_variable_shape(p) for p in params]
+        moments = [K.zeros(shape) for shape in shapes]
+        self.weights = [self.iterations] + moments
+        for p, g, m in zip(params, grads, moments):
+
+            # if a weight tensor (len > 1) use weight normalized parameterization
+            ps = K.get_variable_shape(p)
+            if len(ps) > 1:
+
+                # get weight normalization parameters
+                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)
+
+                # momentum container for the 'g' parameter
+                V_scaler_shape = K.get_variable_shape(V_scaler)
+                m_g = K.zeros(V_scaler_shape)
+
+                # update g parameters
+                v_g = self.momentum * m_g - lr * grad_g  # velocity
+                self.updates.append(K.update(m_g, v_g))
+                if self.nesterov:
+                    new_g_param = g_param + self.momentum * v_g - lr * grad_g
+                else:
+                    new_g_param = g_param + v_g
+
+                # update V parameters
+                v_v = self.momentum * m - lr * grad_V  # velocity
+                self.updates.append(K.update(m, v_v))
+                if self.nesterov:
+                    new_V_param = V + self.momentum * v_v - lr * grad_V
+                else:
+                    new_V_param = V + v_v
+
+                # if there are constraints we apply them to V, not W
+                if getattr(p, 'constraint', None) is not None:
+                    new_V_param = p.constraint(new_V_param)
+
+                # wn param updates --> W updates
+                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)
+
+            else:  # normal SGD with momentum
+                v = self.momentum * m - lr * g  # velocity
+                self.updates.append(K.update(m, v))
+
+                if self.nesterov:
+                    new_p = p + self.momentum * v - lr * g
+                else:
+                    new_p = p + v
+
+                # apply constraints
+                if getattr(p, 'constraint', None) is not None:
+                    new_p = p.constraint(new_p)
+
+                self.updates.append(K.update(p, new_p))
+        return self.updates
+
+# adapted from keras.optimizers.Adam
+class AdamWithWeightnorm(Adam):
+    def get_updates(self, loss, params):
+        grads = self.get_gradients(loss, params)
+        self.updates = [K.update_add(self.iterations, 1)]
+
+        lr = self.lr
+        if self.initial_decay > 0:
+            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
+
+        t = K.cast(self.iterations + 1, K.floatx())
+        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))
+
+        shapes = [K.get_variable_shape(p) for p in params]
+        ms = [K.zeros(shape) for shape in shapes]
+        vs = [K.zeros(shape) for shape in shapes]
+        self.weights = [self.iterations] + ms + vs
+
+        for p, g, m, v in zip(params, grads, ms, vs):
+
+            # if a weight tensor (len > 1) use weight normalized parameterization
+            # this is the only part changed w.r.t. keras.optimizers.Adam
+            ps = K.get_variable_shape(p)
+            if len(ps) > 1:
+
+                # get weight normalization parameters
+                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)
+
+                # Adam containers for the 'g' parameter
+                V_scaler_shape = K.get_variable_shape(V_scaler)
+                m_g = K.zeros(V_scaler_shape)
+                v_g = K.zeros(V_scaler_shape)
+
+                # update g parameters
+                m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
+                v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
+                new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
+                self.updates.append(K.update(m_g, m_g_t))
+                self.updates.append(K.update(v_g, v_g_t))
+
+                # update V parameters
+                m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
+                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
+                new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+                self.updates.append(K.update(m, m_t))
+                self.updates.append(K.update(v, v_t))
+
+                # if there are constraints we apply them to V, not W
+                if getattr(p, 'constraint', None) is not None:
+                    new_V_param = p.constraint(new_V_param)
+
+                # wn param updates --> W updates
+                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)
+
+            else:  # do optimization normally
+                m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
+                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
+                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+
+                self.updates.append(K.update(m, m_t))
+                self.updates.append(K.update(v, v_t))
+
+                new_p = p_t
+                # apply constraints
+                if getattr(p, 'constraint', None) is not None:
+                    new_p = p.constraint(new_p)
+                self.updates.append(K.update(p, new_p))
+        return self.updates
+
+
+def get_weightnorm_params_and_grads(p, g):
+    ps = K.get_variable_shape(p)
+
+    # construct weight scaler: V_scaler = g/||V||
+    V_scaler_shape = (ps[-1],)  # assumes we're using tensorflow!
+    V_scaler = K.ones(V_scaler_shape)  # init to ones, so effective parameters don't change
+
+    # get V parameters = ||V||/g * W
+    norm_axes = [i for i in range(len(ps) - 1)]
+    V = p / tf.reshape(V_scaler, [1] * len(norm_axes) + [-1])
+
+    # split V_scaler into ||V|| and g parameters
+    V_norm = tf.sqrt(tf.reduce_sum(tf.square(V), norm_axes))
+    g_param = V_scaler * V_norm
+
+    # get grad in V,g parameters
+    grad_g = tf.reduce_sum(g * V, norm_axes) / V_norm
+    grad_V = tf.reshape(V_scaler, [1] * len(norm_axes) + [-1]) * \
+             (g - tf.reshape(grad_g / V_norm, [1] * len(norm_axes) + [-1]) * V)
+
+    return V, V_norm, V_scaler, g_param, grad_g, grad_V
+
+
+def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
+    ps = K.get_variable_shape(new_V_param)
+    norm_axes = [i for i in range(len(ps) - 1)]
+
+    # update W and V_scaler
+    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
+    new_V_scaler = new_g_param / new_V_norm
+    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
+    updates.append(K.update(W, new_W))
+    updates.append(K.update(V_scaler, new_V_scaler))
+
+
+# data based initialization for a given Keras model
+def data_based_init(model, input):
+
+    # input can be dict, numpy array, or list of numpy arrays
+    if type(input) is dict:
+        feed_dict = input
+    elif type(input) is list:
+        feed_dict = {tf_inp: np_inp for tf_inp, np_inp in zip(model.inputs, input)}
+    else:
+        feed_dict = {model.inputs[0]: input}
+
+    # add learning phase if required
+    if model.uses_learning_phase and K.learning_phase() not in feed_dict:
+        feed_dict.update({K.learning_phase(): 1})
+
+    # get all layer name, output, weight, bias tuples
+    layer_output_weight_bias = []
+    for l in model.layers:
+        trainable_weights = l.trainable_weights
+        if len(trainable_weights) == 2:
+            W, b = trainable_weights
+            assert(l.built)
+            layer_output_weight_bias.append((l.name, l.get_output_at(0), W, b))  # if more than one node, only use the first
+
+    # iterate over our list and do data dependent init
+    sess = K.get_session()
+    for l, o, W, b in layer_output_weight_bias:
+        print('Performing data dependent initialization for layer ' + l)
+        m, v = tf.nn.moments(o, [i for i in range(len(o.get_shape()) - 1)])
+        s = tf.sqrt(v + 1e-10)
+        updates = tf.group(W.assign(W / tf.reshape(s, [1] * (len(W.get_shape()) - 1) + [-1])), b.assign((b - m) / s))
+        sess.run(updates, feed_dict)
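
For reference, a minimal usage sketch (not part of the diff itself) of the two pieces this change adds: a weight-normalized optimizer and `data_based_init`. It assumes Keras 2.x on the TensorFlow 1.x backend with `weightnorm.py` importable, and uses a toy dense model on random data purely as a stand-in for the CIFAR-10 script above.

```python
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation

# both the optimizer and the initializer come from the weightnorm.py added above
from weightnorm import AdamWithWeightnorm, data_based_init

# toy stand-in data: 256 samples, 32 features, 10 classes
x_train = np.random.rand(256, 32).astype('float32')
y_train = keras.utils.to_categorical(np.random.randint(0, 10, size=(256,)), 10)

# any Keras model works; layers holding a (kernel, bias) pair get data dependent init
model = Sequential()
model.add(Dense(64, input_shape=(32,)))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('softmax'))

# 1) compile with a weight-normalized optimizer in place of the stock one
model.compile(loss='categorical_crossentropy',
              optimizer=AdamWithWeightnorm(lr=0.001),
              metrics=['accuracy'])

# 2) run data dependent initialization on a small batch before training
data_based_init(model, x_train[:100])

model.fit(x_train, y_train, batch_size=32, epochs=1)
```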