UTF-8 support #113

Open · wants to merge 5 commits into base: master
158 changes: 158 additions & 0 deletions copy.lua
@@ -0,0 +1,158 @@
--[[

Training an NTM to memorize its input.

The current version seems to work, giving good output after 5000 iterations
or so. Proper initialization of the read/write weights seems to be crucial
here.

--]]
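-- Illustrative encoding sketch (derived from the code below, not part of the
-- original script): with input_dim = 10, columns 1-2 of every vector are the
-- start/end delimiter channels and columns 3-10 carry the random bits, so a
-- length-2 episode is presented roughly as
--   start_symbol -> [1 0 | 0 0 0 0 0 0 0 0]
--   seq[1]       -> [0 0 | b b b b b b b b]   (each b sampled from {0, 1})
--   seq[2]       -> [0 0 | b b b b b b b b]
--   end_symbol   -> [0 1 | 0 0 0 0 0 0 0 0]
-- followed by len all-zero query vectors whose outputs are trained to
-- reproduce seq.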

require('../')
require('./util')
require('optim')
require('sys')

torch.manualSeed(0)

-- NTM config
local config = {
  input_dim = 10,
  output_dim = 10,
  mem_rows = 128,
  mem_cols = 20,
  cont_dim = 100
}

local input_dim = config.input_dim
local start_symbol = torch.zeros(input_dim)
start_symbol[1] = 1
local end_symbol = torch.zeros(input_dim)
end_symbol[2] = 1

function generate_sequence(len, bits)
  local seq = torch.zeros(len, bits + 2)
  for i = 1, len do
    seq[{i, {3, bits + 2}}] = torch.rand(bits):round()
  end
  return seq
end

function forward(model, seq, print_flag)
  local len = seq:size(1)
  local loss = 0

  -- present start symbol
  model:forward(start_symbol)

  -- present inputs
  if print_flag then print('write head max') end
  for j = 1, len do
    model:forward(seq[j])
    if print_flag then print_write_max(model) end
  end

  -- present end symbol
  model:forward(end_symbol)

  -- present targets
  local zeros = torch.zeros(input_dim)
  local outputs = torch.Tensor(len, input_dim)
  local criteria = {}
  if print_flag then print('read head max') end
  for j = 1, len do
    criteria[j] = nn.BCECriterion()
    outputs[j] = model:forward(zeros)
    loss = loss + criteria[j]:forward(outputs[j], seq[j]) * input_dim
    if print_flag then print_read_max(model) end
  end
  return outputs, criteria, loss
end

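-- Backpropagation runs in reverse presentation order: first through the target
-- (read) steps with the scaled BCE gradients, then the end symbol, the written
-- inputs, and finally the start symbol, all of which receive zero output
-- gradients since only the read phase is penalized.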
function backward(model, seq, outputs, criteria)
  local len = seq:size(1)
  local zeros = torch.zeros(input_dim)
  for j = len, 1, -1 do
    model:backward(
      zeros,
      criteria[j]
        :backward(outputs[j], seq[j])
        :mul(input_dim)
    )
  end

  model:backward(end_symbol, zeros)
  for j = len, 1, -1 do
    model:backward(seq[j], zeros)
  end
  model:backward(start_symbol, zeros)
end

local model = ntm.NTM(config)
local params, grads = model:getParameters()

local num_iters = 10000
local start = sys.clock()
local print_interval = 25
local min_len = 1
local max_len = 20

print(string.rep('=', 80))
print("NTM copy task")
print('training up to ' .. num_iters .. ' iteration(s)')
print('min sequence length = ' .. min_len)
print('max sequence length = ' .. max_len)
print(string.rep('=', 80))
print('num params: ' .. params:size(1))

local rmsprop_state = {
  learningRate = 1e-4,
  momentum = 0.9,
  decay = 0.95
}

-- local adagrad_state = {
--   learningRate = 1e-3
-- }

-- train
for iter = 1, num_iters do
  local print_flag = (iter % print_interval == 0)
  local feval = function(x)
    if print_flag then
      print(string.rep('-', 80))
      print('iter = ' .. iter)
      print('learn rate = ' .. rmsprop_state.learningRate)
      print('momentum = ' .. rmsprop_state.momentum)
      print('decay = ' .. rmsprop_state.decay)
      printf('t = %.1fs\n', sys.clock() - start)
    end

    local loss = 0
    grads:zero()

    local len = math.floor(torch.random(min_len, max_len))
    local seq = generate_sequence(len, input_dim - 2)
    local outputs, criteria, sample_loss = forward(model, seq, print_flag)
    loss = loss + sample_loss
    backward(model, seq, outputs, criteria)
    if print_flag then
      print("target:")
      print(seq)
      print("output:")
      print(outputs)
    end

    -- clip gradients
    grads:clamp(-10, 10)
    if print_flag then
      print('max grad = ' .. grads:max())
      print('min grad = ' .. grads:min())
      print('loss = ' .. loss)
    end
    return loss, grads
  end

  --optim.adagrad(feval, params, adagrad_state)
  ntm.rmsprop(feval, params, rmsprop_state)
end
4 changes: 4 additions & 0 deletions lr.lua
@@ -0,0 +1,4 @@
local function lr()
  return 0.008
end
return lr
96 changes: 96 additions & 0 deletions model/CircularConvolution.lua
@@ -0,0 +1,96 @@
--[[

Input: A table {x, k} of a vector x and a convolution kernel k.

Output: Circular convolution of x with k.

TODO: This module can probably be implemented more efficiently.

--]]
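-- Illustrative note (spelled out from the loops below, not part of the original
-- file): for the length-3 shift kernels used by the NTM addressing mechanism,
-- the forward pass reduces to
--   output[i] = k[1] * v[i + 1] + k[2] * v[i] + k[3] * v[i - 1]
-- with the indices of v wrapping around circularly.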

local CircularConvolution, parent = torch.class('nn.CircularConvolution', 'nn.Module')

function CircularConvolution:__init()
  parent.__init(self)
  self.gradInput = {}
end

function rotate_left(input, step)
  local output = input.new():resizeAs(input)
  local size = input:size(1)
  output[{{1, size - step}}] = input[{{step + 1, size}}]
  output[{{size - step + 1, size}}] = input[{{1, step}}]
  return output
end

function rotate_right(input, step)
  local output = input.new():resizeAs(input)
  local size = input:size(1)
  output[{{step + 1, size}}] = input[{{1, size - step}}]
  output[{{1, step}}] = input[{{size - step + 1, size}}]
  return output
end

-- function CircularConvolution:updateOutput_orig(input)
-- local a, b = unpack(input)
-- local size = a:size(1)
-- self.b = b:repeatTensor(1,2)
-- local circ = a.new():resize(size, size)
-- for i = 0, size - 1 do
-- circ[i + 1] = self.b:narrow(2, size - i + 1, size)
-- end
-- self.output:set(torch.mv(circ:t(), a))
-- return self.output
-- end

-- function CircularConvolution:updateGradInput_orig(input, gradOutput)
-- local a, b = unpack(input)
-- local size = a:size(1)
-- for i = 1, 2 do
-- self.gradInput[i] = self.gradInput[i] or input[1].new()
-- self.gradInput[i]:resize(size)
-- end

-- a = a:repeatTensor(1, 2)
-- for i = 0, size - 1 do
-- self.gradInput[1][i + 1] = gradOutput:dot(self.b:narrow(2, size - i + 1, size))
-- self.gradInput[2][i + 1] = gradOutput:dot(a:narrow(2, size - i + 1, size))
-- end
-- return self.gradInput
-- end

function CircularConvolution:updateOutput(input)
  local v, k = unpack(input)
  self.size = v:size(1)
  self.kernel_size = k:size(1)
  self.kernel_shift = math.floor(self.kernel_size / 2)
  self.output = v.new():resize(self.size):zero()
  for i = 1, self.size do
    for j = 1, self.kernel_size do
      local idx = i + self.kernel_shift - j + 1
      if idx < 1 then idx = idx + self.size end
      if idx > self.size then idx = idx - self.size end
      self.output[{{i}}]:add(k[j] * v[idx])
    end
  end
  return self.output
end

function CircularConvolution:updateGradInput(input, gradOutput)
  local v, k = unpack(input)
  self.gradInput[1] = self.gradInput[1] or v.new()
  self.gradInput[2] = self.gradInput[2] or k.new()
  self.gradInput[1]:resize(self.size)
  self.gradInput[2]:resize(self.kernel_size)

  local gradOutput2 = rotate_right(gradOutput:repeatTensor(1, 2):view(2 * self.size), self.kernel_shift)
  for i = 1, self.size do
    self.gradInput[1][i] = k:dot(gradOutput2:narrow(1, i, self.kernel_size))
  end

  local v2 = rotate_left(v:repeatTensor(1, 2):view(2 * self.size), self.kernel_shift + 1)
  for i = 1, self.kernel_size do
    self.gradInput[2][i] = gradOutput:dot(v2:narrow(1, self.size - i + 1, self.size))
  end
  return self.gradInput
end
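-- Usage sketch (illustrative only, not part of this patch): a centered delta
-- kernel puts all weight on the zero-shift position, so the circular
-- convolution should return the input vector unchanged.
local conv = nn.CircularConvolution()
local v = torch.rand(8)
local k = torch.Tensor{0, 1, 0}   -- all weight on the zero-shift position
local out = conv:forward({v, k})
print(torch.dist(out, v))         -- expected to be (numerically) 0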
11 changes: 6 additions & 5 deletions model/GRU.lua
@@ -6,7 +6,7 @@ Creates one timestep of one GRU
 Paper reference: http://arxiv.org/pdf/1412.3555v1.pdf
 ]]--
 function GRU.gru(input_size, rnn_size, n, dropout)
-  dropout = dropout or 0
+  dropout = dropout or 0
   -- there are n+1 inputs (hiddens on each layer and x)
   local inputs = {}
   table.insert(inputs, nn.Identity()()) -- x
@@ -26,11 +26,12 @@ function GRU.gru(input_size, rnn_size, n, dropout)
 
     local prev_h = inputs[L+1]
     -- the input to this layer
-    if L == 1 then
-      x = OneHot(input_size)(inputs[1])
+    if L == 1 then
+      print(input_size)
+      x = nn.LookupTable(input_size,rnn_size)(inputs[1])
       input_size_L = input_size
-    else
-      x = outputs[(L-1)]
+    else
+      x = outputs[(L-1)]
       if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
       input_size_L = rnn_size
     end