diff --git a/keras/README.md b/keras_1/README.md
similarity index 100%
rename from keras/README.md
rename to keras_1/README.md
diff --git a/keras/cifar10_cnn.py b/keras_1/cifar10_cnn.py
similarity index 100%
rename from keras/cifar10_cnn.py
rename to keras_1/cifar10_cnn.py
diff --git a/keras/weightnorm.py b/keras_1/weightnorm.py
similarity index 100%
rename from keras/weightnorm.py
rename to keras_1/weightnorm.py
diff --git a/keras_2/README.md b/keras_2/README.md
new file mode 100644
index 0000000..b59eeae
--- /dev/null
+++ b/keras_2/README.md
@@ -0,0 +1,6 @@
+
+# Weight Normalization using Keras
+
+Example code for using Weight Normalization with [Keras](https://keras.io).
+
+```cifar10_cnn.py``` contains the standard CIFAR-10 example from Keras, with lines 66-68 and 80-82 edited to include weight normalization and data dependent initialization.
\ No newline at end of file
diff --git a/keras_2/cifar10_cnn.py b/keras_2/cifar10_cnn.py
new file mode 100644
index 0000000..8dd51bc
--- /dev/null
+++ b/keras_2/cifar10_cnn.py
@@ -0,0 +1,145 @@
+'''
+CIFAR-10 example from https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py
+Now with weight normalization. Lines 66-68 (weight normalized optimizer) and 80-82 (data dependent initialization) contain the changes w.r.t. the original.
+'''
+
+from __future__ import print_function
+import keras
+from keras.datasets import cifar10
+from keras.preprocessing.image import ImageDataGenerator
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+import os
+
+'''Train a simple deep CNN on the CIFAR10 small images dataset.
+
+It gets to 75% validation accuracy in 25 epochs, and 79% after 50 epochs.
+(it's still underfitting at that point, though).
+
+With weight normalization, a validation accuracy of 75% is already reached
+after 10 epochs.
+'''
+
+batch_size = 32
+num_classes = 10
+epochs = 100
+data_augmentation = True
+num_predictions = 20
+save_dir = os.path.join(os.getcwd(), 'saved_models')
+model_name = 'keras_cifar10_trained_model.h5'
+
+# The data, split between train and test sets:
+(x_train, y_train), (x_test, y_test) = cifar10.load_data()
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# Convert class vectors to binary class matrices.
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, (3, 3), padding='same',
+                 input_shape=x_train.shape[1:]))
+model.add(Activation('relu'))
+model.add(Conv2D(32, (3, 3)))
+model.add(Activation('relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+
+model.add(Conv2D(64, (3, 3), padding='same'))
+model.add(Activation('relu'))
+model.add(Conv2D(64, (3, 3)))
+model.add(Activation('relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+
+model.add(Flatten())
+model.add(Dense(512))
+model.add(Activation('relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes))
+model.add(Activation('softmax'))
+
+# let's train the model using SGD + momentum (how original). EDIT: now with weight normalization, so slightly more original ;-)
+from weightnorm import SGDWithWeightnorm, AdamWithWeightnorm
+opt_wn = SGDWithWeightnorm(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
+#opt_wn = AdamWithWeightnorm(lr=0.001, decay=1e-6)
+
+# Let's train the model using the weight normalized optimizer
+model.compile(loss='categorical_crossentropy',
+              optimizer=opt_wn,
+              metrics=['accuracy'])
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+
+# data based initialization of parameters
+from weightnorm import data_based_init
+data_based_init(model, x_train[:100])
+
+if not data_augmentation:
+    print('Not using data augmentation.')
+    model.fit(x_train, y_train,
+              batch_size=batch_size,
+              epochs=epochs,
+              validation_data=(x_test, y_test),
+              shuffle=True)
+else:
+    print('Using real-time data augmentation.')
+    # This will do preprocessing and realtime data augmentation:
+    datagen = ImageDataGenerator(
+        featurewise_center=False,  # set input mean to 0 over the dataset
+        samplewise_center=False,  # set each sample mean to 0
+        featurewise_std_normalization=False,  # divide inputs by std of the dataset
+        samplewise_std_normalization=False,  # divide each input by its std
+        zca_whitening=False,  # apply ZCA whitening
+        zca_epsilon=1e-06,  # epsilon for ZCA whitening
+        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
+        # randomly shift images horizontally (fraction of total width)
+        width_shift_range=0.1,
+        # randomly shift images vertically (fraction of total height)
+        height_shift_range=0.1,
+        shear_range=0.,  # set range for random shear
+        zoom_range=0.,  # set range for random zoom
+        channel_shift_range=0.,  # set range for random channel shifts
+        # set mode for filling points outside the input boundaries
+        fill_mode='nearest',
+        cval=0.,  # value used for fill_mode = "constant"
+        horizontal_flip=True,  # randomly flip images
+        vertical_flip=False,  # randomly flip images
+        # set rescaling factor (applied before any other transformation)
+        rescale=None,
+        # set function that will be applied on each input
+        preprocessing_function=None,
+        # image data format, either "channels_first" or "channels_last"
+        data_format=None,
+        # fraction of images reserved for validation (strictly between 0 and 1)
+        validation_split=0.0)
+
+    # Compute quantities required for feature-wise normalization
+    # (std, mean, and principal components if ZCA whitening is applied).
+    datagen.fit(x_train)
+
+    # Fit the model on the batches generated by datagen.flow().
+    model.fit_generator(datagen.flow(x_train, y_train,
+                                     batch_size=batch_size),
+                        epochs=epochs,
+                        steps_per_epoch=len(x_train) // batch_size,
+                        validation_data=(x_test, y_test),
+                        workers=4)
+
+# Save model and weights
+if not os.path.isdir(save_dir):
+    os.makedirs(save_dir)
+model_path = os.path.join(save_dir, model_name)
+model.save(model_path)
+print('Saved trained model at %s ' % model_path)
+
+# Score trained model.
+scores = model.evaluate(x_test, y_test, verbose=1)
+print('Test loss:', scores[0])
+print('Test accuracy:', scores[1])
\ No newline at end of file
diff --git a/keras_2/weightnorm.py b/keras_2/weightnorm.py
new file mode 100644
index 0000000..dec50b1
--- /dev/null
+++ b/keras_2/weightnorm.py
@@ -0,0 +1,208 @@
+from keras import backend as K
+from keras.optimizers import SGD, Adam
+import tensorflow as tf
+
+# adapted from keras.optimizers.SGD
+class SGDWithWeightnorm(SGD):
+    def get_updates(self, loss, params):
+        grads = self.get_gradients(loss, params)
+        self.updates = []
+
+        lr = self.lr
+        if self.initial_decay > 0:
+            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
+        self.updates.append(K.update_add(self.iterations, 1))
+
+        # momentum
+        shapes = [K.get_variable_shape(p) for p in params]
+        moments = [K.zeros(shape) for shape in shapes]
+        self.weights = [self.iterations] + moments
+        for p, g, m in zip(params, grads, moments):
+
+            # if a weight tensor (len > 1) use weight normalized parameterization
+            ps = K.get_variable_shape(p)
+            if len(ps) > 1:
+
+                # get weight normalization parameters
+                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)
+
+                # momentum container for the 'g' parameter
+                V_scaler_shape = K.get_variable_shape(V_scaler)
+                m_g = K.zeros(V_scaler_shape)
+
+                # update g parameters
+                v_g = self.momentum * m_g - lr * grad_g  # velocity
+                self.updates.append(K.update(m_g, v_g))
+                if self.nesterov:
+                    new_g_param = g_param + self.momentum * v_g - lr * grad_g
+                else:
+                    new_g_param = g_param + v_g
+
+                # update V parameters
+                v_v = self.momentum * m - lr * grad_V  # velocity
+                self.updates.append(K.update(m, v_v))
+                if self.nesterov:
+                    new_V_param = V + self.momentum * v_v - lr * grad_V
+                else:
+                    new_V_param = V + v_v
+
+                # if there are constraints we apply them to V, not W
+                if getattr(p, 'constraint', None) is not None:
+                    new_V_param = p.constraint(new_V_param)
+
+                # wn param updates --> W updates
+                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)
+
+            else:  # normal SGD with momentum
+                v = self.momentum * m - lr * g  # velocity
+                self.updates.append(K.update(m, v))
+
+                if self.nesterov:
+                    new_p = p + self.momentum * v - lr * g
+                else:
+                    new_p = p + v
+
+                # apply constraints
+                if getattr(p, 'constraint', None) is not None:
+                    new_p = p.constraint(new_p)
+
+                self.updates.append(K.update(p, new_p))
+        return self.updates
+
+# adapted from keras.optimizers.Adam
+class AdamWithWeightnorm(Adam):
+    def get_updates(self, loss, params):
+        grads = self.get_gradients(loss, params)
+        self.updates = [K.update_add(self.iterations, 1)]
+
+        lr = self.lr
+        if self.initial_decay > 0:
+            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
+
+        t = K.cast(self.iterations + 1, K.floatx())
+        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))
+
+        shapes = [K.get_variable_shape(p) for p in params]
+        ms = [K.zeros(shape) for shape in shapes]
+        vs = [K.zeros(shape) for shape in shapes]
+        self.weights = [self.iterations] + ms + vs
+
+        for p, g, m, v in zip(params, grads, ms, vs):
+
+            # if a weight tensor (len > 1) use weight normalized parameterization
+            # this is the only part changed w.r.t. keras.optimizers.Adam
+            ps = K.get_variable_shape(p)
+            if len(ps) > 1:
+
+                # get weight normalization parameters
+                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)
+
+                # Adam containers for the 'g' parameter
+                V_scaler_shape = K.get_variable_shape(V_scaler)
+                m_g = K.zeros(V_scaler_shape)
+                v_g = K.zeros(V_scaler_shape)
+
+                # update g parameters
+                m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
+                v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
+                new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
+                self.updates.append(K.update(m_g, m_g_t))
+                self.updates.append(K.update(v_g, v_g_t))
+
+                # update V parameters
+                m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
+                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
+                new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+                self.updates.append(K.update(m, m_t))
+                self.updates.append(K.update(v, v_t))
+
+                # if there are constraints we apply them to V, not W
+                if getattr(p, 'constraint', None) is not None:
+                    new_V_param = p.constraint(new_V_param)
+
+                # wn param updates --> W updates
+                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)
+
+            else:  # do optimization normally
+                m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
+                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
+                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+
+                self.updates.append(K.update(m, m_t))
+                self.updates.append(K.update(v, v_t))
+
+                new_p = p_t
+                # apply constraints
+                if getattr(p, 'constraint', None) is not None:
+                    new_p = p.constraint(new_p)
+                self.updates.append(K.update(p, new_p))
+        return self.updates
+
+
+def get_weightnorm_params_and_grads(p, g):
+    ps = K.get_variable_shape(p)
+
+    # construct weight scaler: V_scaler = g/||V||
+    V_scaler_shape = (ps[-1],)  # assumes we're using tensorflow!
+    V_scaler = K.ones(V_scaler_shape)  # init to ones, so effective parameters don't change
+
+    # get V parameters = ||V||/g * W
+    norm_axes = [i for i in range(len(ps) - 1)]
+    V = p / tf.reshape(V_scaler, [1] * len(norm_axes) + [-1])
+
+    # split V_scaler into ||V|| and g parameters
+    V_norm = tf.sqrt(tf.reduce_sum(tf.square(V), norm_axes))
+    g_param = V_scaler * V_norm
+
+    # get grad in V,g parameters
+    grad_g = tf.reduce_sum(g * V, norm_axes) / V_norm
+    grad_V = tf.reshape(V_scaler, [1] * len(norm_axes) + [-1]) * \
+             (g - tf.reshape(grad_g / V_norm, [1] * len(norm_axes) + [-1]) * V)
+
+    return V, V_norm, V_scaler, g_param, grad_g, grad_V
+
+
+def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
+    ps = K.get_variable_shape(new_V_param)
+    norm_axes = [i for i in range(len(ps) - 1)]
+
+    # update W and V_scaler
+    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
+    new_V_scaler = new_g_param / new_V_norm
+    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
+    updates.append(K.update(W, new_W))
+    updates.append(K.update(V_scaler, new_V_scaler))
+
+
+# data based initialization for a given Keras model
+def data_based_init(model, input):
+
+    # input can be dict, numpy array, or list of numpy arrays
+    if type(input) is dict:
+        feed_dict = input
+    elif type(input) is list:
+        feed_dict = {tf_inp: np_inp for tf_inp, np_inp in zip(model.inputs, input)}
+    else:
+        feed_dict = {model.inputs[0]: input}
+
+    # add learning phase if required
+    if model.uses_learning_phase and K.learning_phase() not in feed_dict:
+        feed_dict.update({K.learning_phase(): 1})
+
+    # get all layer name, output, weight, bias tuples
+    layer_output_weight_bias = []
+    for l in model.layers:
+        trainable_weights = l.trainable_weights
+        if len(trainable_weights) == 2:
+            W, b = trainable_weights
+            assert(l.built)
+            layer_output_weight_bias.append((l.name, l.get_output_at(0), W, b))  # if more than one node, only use the first
+
+    # iterate over our list and do data dependent init
+    sess = K.get_session()
+    for l, o, W, b in layer_output_weight_bias:
+        print('Performing data dependent initialization for layer ' + l)
+        m, v = tf.nn.moments(o, [i for i in range(len(o.get_shape()) - 1)])
+        s = tf.sqrt(v + 1e-10)
+        updates = tf.group(W.assign(W / tf.reshape(s, [1] * (len(W.get_shape()) - 1) + [-1])), b.assign((b - m) / s))
+        sess.run(updates, feed_dict)
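
For reference, a minimal usage sketch (not part of the diff itself) of the two pieces this change adds: a weight-normalized optimizer and `data_based_init`. It assumes Keras 2.x on the TensorFlow 1.x backend with `weightnorm.py` importable, and uses a toy dense model on random data purely as a stand-in for the CIFAR-10 script above.

```python
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation

# both the optimizer and the initializer come from the weightnorm.py added above
from weightnorm import AdamWithWeightnorm, data_based_init

# toy stand-in data: 256 samples, 32 features, 10 classes
x_train = np.random.rand(256, 32).astype('float32')
y_train = keras.utils.to_categorical(np.random.randint(0, 10, size=(256,)), 10)

# any Keras model works; layers holding a (kernel, bias) pair get data dependent init
model = Sequential()
model.add(Dense(64, input_shape=(32,)))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('softmax'))

# 1) compile with a weight-normalized optimizer in place of the stock one
model.compile(loss='categorical_crossentropy',
              optimizer=AdamWithWeightnorm(lr=0.001),
              metrics=['accuracy'])

# 2) run data dependent initialization on a small batch before training
data_based_init(model, x_train[:100])

model.fit(x_train, y_train, batch_size=32, epochs=1)
```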